Skip to content

Commit fbe8558

Browse files
Copilot and hechth committed
Filter NA/empty values from metadata when reading tabular and MSP files
Co-authored-by: hechth <12066490+hechth@users.noreply.github.com>
1 parent e93e5e3 commit fbe8558

6 files changed

Lines changed: 76 additions & 2 deletions

File tree

MSMetaEnhancer/libs/data/DataFrame.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from MSMetaEnhancer.libs.data.Data import Data
44
from MSMetaEnhancer.libs.utils.Errors import UnknownFileFormat
5+
from MSMetaEnhancer.libs.utils.Generic import is_na_value
56

67

78
class DataFrame(Data):
@@ -45,7 +46,8 @@ def save_data(self, filename: str, file_format: str):
4546
raise UnknownFileFormat(f'Format {file_format} not supported.')
4647

4748
def get_metadata(self):
48-
return self.df.to_dict('records')
49+
records = self.df.to_dict('records')
50+
return [{k: v for k, v in record.items() if not is_na_value(v)} for record in records]
4951

5052
def fuse_metadata(self, metadata_list):
5153
self.df = pandas.DataFrame.from_dict(metadata_list)

MSMetaEnhancer/libs/data/Spectra.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from MSMetaEnhancer.libs.data.Data import Data
77
from MSMetaEnhancer.libs.utils.Errors import UnknownFileFormat
8+
from MSMetaEnhancer.libs.utils.Generic import is_na_value
89

910

1011
class Spectra(Data):
@@ -48,7 +49,8 @@ def save_data(self, filename: str, file_format: str):
4849
raise UnknownFileFormat(f'Format {file_format} not supported.')
4950

5051
def get_metadata(self):
51-
return [spectra.metadata for spectra in self.spectrums]
52+
return [{k: v for k, v in spectra.metadata.items() if not is_na_value(v)}
53+
for spectra in self.spectrums]
5254

5355
def fuse_metadata(self, metadata):
5456
for i in range(len(metadata)):

MSMetaEnhancer/libs/utils/Generic.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
import math
2+
3+
4+
# Spellings (already lower-cased and stripped) that mark a string as "no value".
NA_STRING_VALUES = {'na', 'n/a', 'nan', 'none', ''}


def is_na_value(value) -> bool:
    """Tell whether ``value`` stands for a missing metadata entry.

    A value counts as missing when it is:

    * ``None``;
    * a ``float`` (or subclass) that is NaN;
    * a string which, after stripping surrounding whitespace and
      lower-casing, is one of ``NA_STRING_VALUES`` (this includes the
      empty / whitespace-only string).

    Anything else -- including ``0``, ``0.0`` and empty containers -- is
    treated as a real value.

    NOTE(review): string matching is case-insensitive, so the literal
    ``'Na'`` is also reported as missing -- confirm no metadata field is
    expected to hold that text legitimately.

    :param value: arbitrary metadata value to inspect
    :return: ``True`` if the value should be treated as NA, else ``False``
    """
    if isinstance(value, str):
        return value.strip().lower() in NA_STRING_VALUES
    if isinstance(value, float):
        return math.isnan(value)
    return value is None
16+
17+
118
def escape_single_quotes(f):
219
async def wrapper(self, arg):
320
return await f(self, arg.replace("'", "\\'"))
tests/test_data/sample_metadata_with_na.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
formula,mw,casno,inchikey,smiles
2+
H2,2,1333740,NA,
3+
D2,4,7782390,nan,None
4+
CH4,16,74828,N/A,n/a

tests/test_data/sample_with_na.msp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
NAME: Hydrogen
2+
FORMULA: H2
3+
MW: 2
4+
INCHIKEY: NA
5+
SMILES: n/a
6+
NUM PEAKS: 2
7+
1.0 20.98
8+
2.0 999.0
9+
10+
NAME: Deuterium
11+
FORMULA: D2
12+
MW: 4
13+
INCHIKEY: nan
14+
SMILES: None
15+
NUM PEAKS: 2
16+
2.0 14.99
17+
4.0 999.0
18+
19+
NAME: Methane
20+
FORMULA: CH4
21+
MW: 16
22+
INCHIKEY: N/A
23+
SMILES:
24+
NUM PEAKS: 6
25+
12.0 37.97
26+
13.0 105.9
27+
14.0 203.82
28+
15.0 886.2
29+
16.0 999.0
30+
17.0 15.99

tests/test_io.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,22 @@ def test_tabular_data():
8383
assert meta_item[key] == data_item[key], (
8484
f"Value mismatch for key '{key}' at index {i}: {meta_item[key]} != {data_item[key]}"
8585
)
86+
87+
88+
@pytest.mark.parametrize('backend, file_type, filename, absent_keys', [
    (DataFrame(), 'csv', 'tests/test_data/sample_metadata_with_na.csv', ['inchikey', 'smiles']),
    (Spectra(), 'msp', 'tests/test_data/sample_with_na.msp', ['inchikey', 'smiles']),
])
def test_na_values_filtered_from_metadata(backend, file_type, filename, absent_keys):
    """Metadata read from a file must not carry keys whose value was NA or empty.

    Each parametrised case loads a three-record fixture in which every
    record has only NA-like or empty values for the keys listed in
    ``absent_keys``; those keys must not appear in any returned dict.
    """
    backend.load_data(filename, file_type)
    records = backend.get_metadata()

    # Filtering drops individual keys, never whole records.
    assert len(records) == 3

    for index, record in enumerate(records):
        for key in absent_keys:
            assert key not in record, (
                f"NA key '{key}' should not be present at index {index}, got {record.get(key)}"
            )

0 commit comments

Comments
 (0)