11from __future__ import annotations
22
3- from abc import ABCMeta , abstractmethod
4- from glob import glob
5- from io import StringIO
6- from itertools import zip_longest
7- from os import path
8- from re import compile
9- from functools import lru_cache
10- from typing import TextIO
3+ from abc import ABCMeta , abstractmethod
4+ from functools import lru_cache
5+ from glob import glob
6+ from io import StringIO
7+ from itertools import zip_longest
8+ from os import path
9+ from re import compile
10+ from typing import TextIO
1111
1212from numpy import nan
1313from pandas import DataFrame , Series , merge , read_csv
3535from .mapping import get_investigation_base_output , investigation_sections_mapping , study_sections_mapping
3636
3737
38- class ISATabReader :
38+ class ISATabReader :
3939 """A class to read an ISA-Tab investigation file into a dictionary of DataFrames
4040
4141 :param fp: A file-like buffer object of the investigation file
4242 """
4343
44- def __init__ (self , fp : TextIO ) -> None :
45- """Constructor for the ISATabReader class"""
46- self .source_file : TextIO = fp
47- self .__next_line : str | None = None
48- self .dataframe_dict : dict [str , DataFrame | str , list [DataFrame ]] = {}
49-
50- @property
51- def source_file (self ) -> TextIO :
52- """Getter for the source file-like buffer object
53-
54- :return: A file-like buffer object
55- """
56- return self .__source_file
57-
58- @source_file .setter
59- def source_file (self , fp : TextIO ) -> None :
60- """Setter for the source file property.
61-
62- :param fp: A file-like buffer object
63- """
64- self .__source_file = fp
65-
66- def __read_next_data_line (self ) -> str :
67- """Read the next non-comment line from the source file."""
68- while True :
69- line = self .source_file .readline ()
70- if not line :
71- return ""
72- if not line .lstrip ().startswith ("#" ):
73- return line
74-
75- def __peek (self ) -> str :
76- """Peek at the next line without moving to the next line. This function get the position of the next line,
77- reads the next line, then resets the file pointer to the original position
78-
79- :return: The next line past the current line
80- """
81- if self .__next_line is None :
82- self .__next_line = self .__read_next_data_line ()
83- return self .__next_line
84-
85- def __readline (self ) -> str :
86- """Read the next line, honoring the peek buffer."""
87- if self .__next_line is not None :
88- line = self .__next_line
89- self .__next_line = None
90- return line
91- return self .__read_next_data_line ()
44+ def __init__ (self , fp : TextIO ) -> None :
45+ """Constructor for the ISATabReader class"""
46+ self .source_file : TextIO = fp
47+ self .__next_line : str | None = None
48+ self .dataframe_dict : dict [str , DataFrame | str , list [DataFrame ]] = {}
49+
@property
def source_file(self) -> TextIO:
    """The file-like buffer object this reader takes its lines from

    :return: The buffer object currently stored on the instance
    """
    handle: TextIO = self.__source_file
    return handle

@source_file.setter
def source_file(self, fp: TextIO) -> None:
    """Store the file-like buffer object to read from.

    No validation is performed; the object is kept as given.

    :param fp: A file-like buffer object
    """
    self.__source_file = fp
65+
66+ def __read_next_data_line (self ) -> str :
67+ """Read the next non-comment line from the source file."""
68+ while True :
69+ line = self .source_file .readline ()
70+ if not line :
71+ return ""
72+ if not line .lstrip ().startswith ("#" ):
73+ return line
74+
75+ def __peek (self ) -> str :
76+ """Peek at the next line without moving to the next line. This function get the position of the next line,
77+ reads the next line, then resets the file pointer to the original position
78+
79+ :return: The next line past the current line
80+ """
81+ if self .__next_line is None :
82+ self .__next_line = self .__read_next_data_line ()
83+ return self .__next_line
84+
85+ def __readline (self ) -> str :
86+ """Read the next line, honoring the peek buffer."""
87+ if self .__next_line is not None :
88+ line = self .__next_line
89+ self .__next_line = None
90+ return line
91+ return self .__read_next_data_line ()
9292
9393 def __read_tab_section (self , sec_key : str , next_sec_key : str ) -> StringIO :
9494 """Slices a file by section delimited by section keys
@@ -97,19 +97,19 @@ def __read_tab_section(self, sec_key: str, next_sec_key: str) -> StringIO:
9797 :param next_sec_key: Delimiter key of end of section
9898 :return: A memory file of the section slice, as a string buffer object
9999 """
100- fileline : str = self .__readline ()
101- normed_line : str = fileline .rstrip ().strip ('"' )
102- memory_file : StringIO = StringIO ()
103-
104- if normed_line != sec_key :
105- raise IOError (f"Invalid ISA-Tab section order: expected '{ sec_key } ', got '{ normed_line } '" )
106- while self .__peek ().rstrip () != next_sec_key :
107- fileline = self .__readline ()
108- if not fileline :
109- break
110- memory_file .write (fileline .rstrip () + "\n " )
111- memory_file .seek (0 )
112- return memory_file
100+ fileline : str = self .__readline ()
101+ normed_line : str = fileline .rstrip ().strip ('"' )
102+ memory_file : StringIO = StringIO ()
103+
104+ if normed_line != sec_key :
105+ raise IOError (f"Invalid ISA-Tab section order: expected '{ sec_key } ', got '{ normed_line } '" )
106+ while self .__peek ().rstrip () != next_sec_key :
107+ fileline = self .__readline ()
108+ if not fileline :
109+ break
110+ memory_file .write (fileline .rstrip () + "\n " )
111+ memory_file .seek (0 )
112+ return memory_file
113113
114114 def __build_section_df (self , current_section_key : str , next_section_key : str ) -> DataFrame :
115115 """Reads a file section into a DataFrame
@@ -149,7 +149,7 @@ def run(self) -> dict[str, DataFrame | str, list[DataFrame]]:
149149 return output
150150
151151
152- class ISATabLoaderMixin (metaclass = ABCMeta ):
152+ class ISATabLoaderMixin (metaclass = ABCMeta ):
153153 """A mixin to provide modeling for the ISATab loaders. Provides shared methods, attributes and implementations
154154
155155 - Properties:
@@ -173,12 +173,12 @@ class ISATabLoaderMixin(metaclass=ABCMeta):
173173 skip_load_tables : bool
174174 filepath : str
175175
176- @staticmethod
177- @lru_cache (maxsize = 64 )
178- def _comment_columns (cols : tuple [str , ...]) -> tuple [str , ...]:
179- return tuple (x for x in cols if _RX_COMMENT .match (str (x )))
180-
181- def __get_ontology_source (self , term_source_ref ) -> OntologySource | None :
@staticmethod
@lru_cache(maxsize=64)
def _comment_columns(cols: tuple[str, ...]) -> tuple[str, ...]:
    """Select the column names that match the comment pattern, keeping their order.

    Results are memoised per distinct ``cols`` tuple (up to 64 entries), which is why the input
    has to be a hashable tuple.

    :param cols: Column names to inspect
    :return: The names in ``cols`` for which ``_RX_COMMENT.match`` succeeds
    """
    selected = []
    for name in cols:
        if _RX_COMMENT.match(str(name)):
            selected.append(name)
    return tuple(selected)
180+
181+ def __get_ontology_source (self , term_source_ref ) -> OntologySource | None :
182182 """Small wrapper to return an ontology source from the map or None if not found
183183
184184 :param term_source_ref: The term source reference
@@ -196,12 +196,12 @@ def get_contacts(self, contact_dataframe: DataFrame) -> list[Person]:
196196 contacts : list [Person ] = []
197197 prefix : str
198198
199- if "Investigation Person Last Name" in contact_dataframe .columns :
200- prefix = "Investigation "
201- elif "Study Person Last Name" in contact_dataframe .columns :
202- prefix = "Study "
203- else :
204- raise KeyError ("Could not resolve contact section prefix from DataFrame columns" )
199+ if "Investigation Person Last Name" in contact_dataframe .columns :
200+ prefix = "Investigation "
201+ elif "Study Person Last Name" in contact_dataframe .columns :
202+ prefix = "Study "
203+ else :
204+ raise KeyError ("Could not resolve contact section prefix from DataFrame columns" )
205205
206206 for current_row in contact_dataframe .to_dict (orient = "records" ):
207207 person : Person = Person (
@@ -225,32 +225,32 @@ def get_contacts(self, contact_dataframe: DataFrame) -> list[Person]:
225225 return contacts
226226
@staticmethod
def get_comments(section_df: DataFrame) -> list[Comment]:
    """Get Comments from a section DataFrame

    One Comment is produced per comment column and per row, grouped column by column. The comment
    name is the first ``_RX_COMMENT.findall`` result for the column header, and the value is that
    row's cell in the column.

    :param section_df: A section DataFrame
    :return: A list of Comment objects as found in the section
    """
    headers = tuple(str(label) for label in section_df.columns)
    return [
        Comment(name=next(iter(_RX_COMMENT.findall(header))), value=record[header])
        for header in ISATabLoaderMixin._comment_columns(headers)
        for _, record in section_df.iterrows()
    ]
240240
@staticmethod
def get_comments_row(cols, row) -> list[Comment]:
    """Get Comments in a given DataFrame row

    One Comment is produced per comment column. The comment name is the first
    ``_RX_COMMENT.findall`` result for the column header, and the value is ``row`` indexed by
    that header.

    :param cols: List of DataFrame columns
    :param row: DataFrame row as a Series object
    :return: A list of Comment objects
    """
    headers = tuple(str(label) for label in cols)
    return [
        Comment(name=next(iter(_RX_COMMENT.findall(header))), value=row[header])
        for header in ISATabLoaderMixin._comment_columns(headers)
    ]
254254
255255 def get_ontology_annotation (self , val , accession , ts_ref ) -> OntologyAnnotation | None :
256256 """Gets an OntologyAnnotation for a given value, accession and term source REF
@@ -294,12 +294,12 @@ def get_publications(self, section_df) -> list[Publication]:
294294 publications : list [Publication ] = []
295295 prefix : str
296296
297- if "Investigation PubMed ID" in section_df .columns :
298- prefix = "Investigation "
299- elif "Study PubMed ID" in section_df .columns :
300- prefix = "Study "
301- else :
302- raise KeyError ("Could not resolve publication section prefix from DataFrame columns" )
297+ if "Investigation PubMed ID" in section_df .columns :
298+ prefix = "Investigation "
299+ elif "Study PubMed ID" in section_df .columns :
300+ prefix = "Study "
301+ else :
302+ raise KeyError ("Could not resolve publication section prefix from DataFrame columns" )
303303
304304 for _ , current_row in section_df .iterrows ():
305305 publication : Publication = Publication (
0 commit comments