Skip to content

Commit 00dda61

Browse files
authored
Merge pull request #171 from RECETOX/copilot/fix-na-value-handling
Do not treat NA values or empty cells as filled metadata
2 parents 6ab93d6 + 954b0b5 commit 00dda61

46 files changed

Lines changed: 880 additions & 495 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

MSMetaEnhancer/app.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515

1616
class Application:
17-
def __init__(self, log_level='info', log_file=None):
17+
def __init__(self, log_level="info", log_file=None):
1818
self.data = None
1919
logger.setup(log_level, log_file)
2020

@@ -25,12 +25,12 @@ def load_data(self, filename, file_format):
2525
:param filename: path to source spectra file
2626
:param file_format: format of spectra
2727
"""
28-
if file_format in ['msp', 'mgf', 'json']:
28+
if file_format in ["msp", "mgf", "json"]:
2929
self.data = Spectra()
30-
elif file_format in ['csv', 'tsv', 'tabular', 'xlsx']:
30+
elif file_format in ["csv", "tsv", "tabular", "xlsx"]:
3131
self.data = DataFrame()
3232
else:
33-
raise UnknownFileFormat(f'Format {file_format} not supported.')
33+
raise UnknownFileFormat(f"Format {file_format} not supported.")
3434
self.data.load_data(filename, file_format)
3535

3636
def save_data(self, filename, file_format):
@@ -51,12 +51,14 @@ def curate_metadata(self):
5151
curated_metadata = Curator().curate_metadata(self.data.get_metadata())
5252
self.data.fuse_metadata(curated_metadata)
5353

54-
async def annotate_spectra(self,
55-
converters,
56-
jobs=None,
57-
repeat: bool = False,
58-
monitor: Monitor = Monitor(),
59-
annotator: Annotator = Annotator()):
54+
async def annotate_spectra(
55+
self,
56+
converters,
57+
jobs=None,
58+
repeat: bool = False,
59+
monitor: Monitor = Monitor(),
60+
annotator: Annotator = Annotator(),
61+
):
6062
"""
6163
Annotates current Spectra data by specified jobs.
6264
@@ -72,9 +74,11 @@ async def annotate_spectra(self,
7274
async with aiohttp.ClientSession() as session:
7375
builder = ConverterBuilder()
7476
builder.validate_converters(converters)
75-
converters, web_converters = builder.build_converters(session, converters)
77+
compute_converters, web_converters = builder.build_converters(
78+
session, converters
79+
)
7680

77-
annotator.set_converters(converters)
81+
annotator.set_converters(compute_converters | web_converters)
7882
monitor.set_converters(web_converters)
7983

8084
# start converters status checker and wait for first status
@@ -86,16 +90,20 @@ async def annotate_spectra(self,
8690
if not jobs:
8791
jobs = []
8892
converter: Converter
89-
for converter in converters.values():
93+
for converter in annotator.converters.values():
9094
jobs += converter.get_conversion_functions()
9195
jobs = convert_to_jobs(jobs)
9296

9397
metadata_list = self.data.get_metadata()
9498

9599
logger.set_target_attributes(jobs, len(metadata_list))
96100

97-
results = await asyncio.gather(*[annotator.annotate(metadata, jobs, repeat)
98-
for metadata in metadata_list])
101+
results = await asyncio.gather(
102+
*[
103+
annotator.annotate(metadata, jobs, repeat)
104+
for metadata in metadata_list
105+
]
106+
)
99107
finally:
100108
monitor.join()
101109

MSMetaEnhancer/libs/Annotator.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,21 @@
22

33
from MSMetaEnhancer.libs.Curator import Curator
44
from MSMetaEnhancer.libs.utils import logger
5-
from MSMetaEnhancer.libs.utils.Errors import TargetAttributeNotRetrieved, SourceAttributeNotAvailable, \
6-
ServiceNotAvailable, UnknownResponse, DataAlreadyPresent
5+
from MSMetaEnhancer.libs.utils.Errors import (
6+
TargetAttributeNotRetrieved,
7+
SourceAttributeNotAvailable,
8+
ServiceNotAvailable,
9+
UnknownResponse,
10+
DataAlreadyPresent,
11+
)
712
from MSMetaEnhancer.libs.utils.Logger import LogRecord
813

914

1015
class Annotator:
1116
"""
1217
Annotator is responsible for annotation process of single spectra.
1318
"""
19+
1420
def __init__(self):
1521
self.converters = dict()
1622
self.curator = Curator()
@@ -41,17 +47,28 @@ async def annotate(self, metadata, jobs, repeat=False):
4147
for job in jobs:
4248
if job.target not in metadata:
4349
try:
44-
metadata, cache = await self.execute_job_with_cache(job, metadata, cache, log)
50+
metadata, cache = await self.execute_job_with_cache(
51+
job, metadata, cache, log
52+
)
4553
if repeat:
4654
added_metadata = True
47-
except (SourceAttributeNotAvailable, TargetAttributeNotRetrieved) as exc:
55+
except (
56+
SourceAttributeNotAvailable,
57+
TargetAttributeNotRetrieved,
58+
) as exc:
4859
log.update(exc, job, level=3)
4960
except (ServiceNotAvailable, UnknownResponse) as exc:
5061
log.update(exc, job, level=2)
5162
except Exception:
5263
log.update(Exception(traceback.format_exc()), job, level=1)
5364
else:
54-
log.update(DataAlreadyPresent(f'Requested attribute {job.target} already present.'), job, level=2)
65+
log.update(
66+
DataAlreadyPresent(
67+
f"Requested attribute {job.target} already present."
68+
),
69+
job,
70+
level=2,
71+
)
5572

5673
logger.add_logs(log)
5774
logger.add_coverage_after(metadata.keys())
@@ -85,7 +102,7 @@ async def execute_job_with_cache(self, job, metadata, cache, warning):
85102
if job.target in cache[job.converter]:
86103
metadata[job.target] = cache[job.converter][job.target]
87104
else:
88-
raise TargetAttributeNotRetrieved('No data retrieved.')
105+
raise TargetAttributeNotRetrieved("No data retrieved.")
89106
else:
90-
raise ServiceNotAvailable(f'Service {job.converter} not available.')
107+
raise ServiceNotAvailable(f"Service {job.converter} not available.")
91108
return metadata, cache

MSMetaEnhancer/libs/Converter.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ class Converter(ABC):
55
"""
66
General class for conversions.
77
"""
8+
89
def __init__(self):
910
self.is_available = True
1011

@@ -46,13 +47,15 @@ def get_conversion_functions(self) -> list:
4647
:return: a list of available conversion functions
4748
"""
4849
available_conversions = []
49-
methods = [method_name for method_name in dir(self) if '_to_' in method_name]
50+
methods = [method_name for method_name in dir(self) if "_to_" in method_name]
5051
for method in methods:
51-
available_conversions.append((*method.split('_to_'), self.converter_name))
52+
available_conversions.append((*method.split("_to_"), self.converter_name))
5253
return available_conversions
5354

5455

55-
def create_top_level_method(obj: Converter, source: str, target: str, method: str, asynch: bool = True):
56+
def create_top_level_method(
57+
obj: Converter, source: str, target: str, method: str, asynch: bool = True
58+
):
5659
"""
5760
Assign a new method to {obj} called {source}_to_{target} which calls {method}.
5861
@@ -62,14 +65,15 @@ def create_top_level_method(obj: Converter, source: str, target: str, method: st
6265
:param method: method which is called in the object with single argument
6366
:param asynch: whether to create asynchronous methods
6467
"""
68+
6569
async def async_conversion(key):
6670
return await getattr(obj, str(method))(key)
6771

6872
def sync_conversion(key):
6973
return getattr(obj, str(method))(key)
7074

71-
doc = f'Convert {source} to {target} using {obj.__class__.__name__} converter'
72-
name = f'{source}_to_{target}'
75+
doc = f"Convert {source} to {target} using {obj.__class__.__name__} converter"
76+
name = f"{source}_to_{target}"
7377

7478
if asynch:
7579
async_conversion.__doc__ = doc

MSMetaEnhancer/libs/Curator.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from matchms.filtering.filter_utils.smile_inchi_inchikey_conversions import (
2-
is_valid_smiles, is_valid_inchi, is_valid_inchikey
2+
is_valid_smiles,
3+
is_valid_inchi,
4+
is_valid_inchikey,
35
)
46
from MSMetaEnhancer.libs.utils.Errors import InvalidAttributeFormat
57

@@ -10,7 +12,7 @@
1012
inchikey = "VNWKTOKETHGBQD-UHFFFAOYSA-N"
1113

1214
print(is_valid_smiles(smiles)) # True if valid SMILES
13-
print(is_valid_inchi(inchi)) # True if valid InChI
15+
print(is_valid_inchi(inchi)) # True if valid InChI
1416
print(is_valid_inchikey(inchikey)) # True if valid InChIKey
1517

1618

@@ -21,6 +23,7 @@ class Curator:
2123
2224
Additionally, it supports metadata validation to make sure the produced data are correct.
2325
"""
26+
2427
def curate_metadata(self, metadata_list):
2528
"""
2629
Iterates over given metadata and curates individual entries.
@@ -40,8 +43,8 @@ def curate_casno(self, metadata):
4043
:param metadata: given metadata
4144
:return: curated metadata
4245
"""
43-
if 'casno' in metadata:
44-
metadata['casno'] = self.fix_cas_number(metadata['casno'])
46+
if "casno" in metadata:
47+
metadata["casno"] = self.fix_cas_number(metadata["casno"])
4548
return metadata
4649

4750
@staticmethod
@@ -54,7 +57,7 @@ def fix_cas_number(cas_number):
5457
"""
5558
if isinstance(cas_number, str):
5659
if "-" not in cas_number:
57-
return f'{cas_number[:-3]}-{cas_number[-3:-1]}-{cas_number[-1]}'
60+
return f"{cas_number[:-3]}-{cas_number[-3:-1]}-{cas_number[-1]}"
5861
return cas_number
5962

6063
@staticmethod
@@ -68,20 +71,26 @@ def filter_invalid_metadata(metadata, log, job):
6871
:return: only valid metadata
6972
"""
7073
filters = {
71-
'smiles': is_valid_smiles,
72-
'canonical_smiles': is_valid_smiles,
73-
'isomeric_smiles': is_valid_smiles,
74-
'inchi': is_valid_inchi,
75-
'inchikey': is_valid_inchikey
74+
"smiles": is_valid_smiles,
75+
"canonical_smiles": is_valid_smiles,
76+
"isomeric_smiles": is_valid_smiles,
77+
"inchi": is_valid_inchi,
78+
"inchikey": is_valid_inchikey,
7679
}
7780

7881
valid_metadata = {}
79-
for (attribute, value) in metadata.items():
82+
for attribute, value in metadata.items():
8083
if attribute in filters.keys():
8184
if filters[attribute](value):
8285
valid_metadata[attribute] = value
8386
else:
84-
log.update(InvalidAttributeFormat(f'Obtained {attribute} in invalid format: {value}'), job, level=2)
87+
log.update(
88+
InvalidAttributeFormat(
89+
f"Obtained {attribute} in invalid format: {value}"
90+
),
91+
job,
92+
level=2,
93+
)
8594
else:
8695
valid_metadata[attribute] = value
8796
return valid_metadata

MSMetaEnhancer/libs/converters/compute/ComputeConverter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ class ComputeConverter(Converter):
55
"""
66
General class for computation conversion.
77
"""
8+
89
async def convert(self, source, target, data):
9-
return getattr(self, f'{source}_to_{target}')(data)
10+
return getattr(self, f"{source}_to_{target}")(data)

MSMetaEnhancer/libs/converters/compute/RDKit.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,15 @@ class RDKit(ComputeConverter):
1313
"""
1414
RDKit is a collection of chemo-informatics and machine-learning software.
1515
"""
16+
1617
def __init__(self):
1718
super().__init__()
1819
# generate top level methods defining allowed conversions
19-
conversions = [('smiles', 'mw', 'from_smiles'),
20-
('canonical_smiles', 'mw', 'from_smiles'),
21-
('isomeric_smiles', 'mw', 'from_smiles')]
20+
conversions = [
21+
("smiles", "mw", "from_smiles"),
22+
("canonical_smiles", "mw", "from_smiles"),
23+
("isomeric_smiles", "mw", "from_smiles"),
24+
]
2225
self.create_top_level_conversion_methods(conversions, asynch=False)
2326

2427
def from_smiles(self, smiles):
@@ -29,7 +32,7 @@ def from_smiles(self, smiles):
2932
:return: computed molecular weight
3033
"""
3134
weight = ExactMolWt(MolFromSmiles(smiles))
32-
return {'mw': weight}
35+
return {"mw": weight}
3336

3437
def inchi_to_canonical_smiles(self, inchi):
3538
"""
@@ -39,7 +42,7 @@ def inchi_to_canonical_smiles(self, inchi):
3942
:return: computed canonical SMILES
4043
"""
4144
smiles = MolToSmiles(MolFromInchi(inchi), isomericSmiles=False)
42-
return {'canonical_smiles': smiles}
45+
return {"canonical_smiles": smiles}
4346

4447
def inchi_to_isomeric_smiles(self, inchi):
4548
"""
@@ -49,7 +52,7 @@ def inchi_to_isomeric_smiles(self, inchi):
4952
:return: computed isomeric SMILES
5053
"""
5154
smiles = MolToSmiles(MolFromInchi(inchi))
52-
return {'isomeric_smiles': smiles}
55+
return {"isomeric_smiles": smiles}
5356

5457
def formula_to_mw(self, formula):
5558
"""
@@ -66,9 +69,13 @@ def formula_to_mw(self, formula):
6669
continue
6770

6871
atom = Atom(parts[index])
69-
multiplier = int(parts[index + 1]) if len(parts) > index + 1 and parts[index + 1].isnumeric() else 1
72+
multiplier = (
73+
int(parts[index + 1])
74+
if len(parts) > index + 1 and parts[index + 1].isnumeric()
75+
else 1
76+
)
7077
mass += atom.GetMass() * multiplier
71-
return {'mw': mass}
78+
return {"mw": mass}
7279

7380
def smiles_to_formula(self, smiles: str) -> dict:
7481
"""
@@ -79,11 +86,11 @@ def smiles_to_formula(self, smiles: str) -> dict:
7986
"""
8087
mol = MolFromSmiles(smiles)
8188
if mol is None:
82-
return {'formula': ''}
89+
return {"formula": ""}
8390

8491
formula = CalcMolFormula(mol)
8592

86-
return {'formula': formula}
93+
return {"formula": formula}
8794

8895
def inchi_to_formula(self, inchi: str) -> dict:
8996
"""
@@ -94,6 +101,6 @@ def inchi_to_formula(self, inchi: str) -> dict:
94101
"""
95102
mol = MolFromInchi(inchi)
96103
if mol is None:
97-
return {'formula': ''}
104+
return {"formula": ""}
98105
formula = CalcMolFormula(mol)
99-
return {'formula': formula}
106+
return {"formula": formula}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
from MSMetaEnhancer.libs.converters.compute.RDKit import RDKit
22

3-
__all__ = ['RDKit']
3+
__all__ = ["RDKit"]

0 commit comments

Comments
 (0)