Skip to content

Commit a978220

Browse files
committed
feat: enhance missing value handling for scoreset records
1 parent db9a54c commit a978220

3 files changed

Lines changed: 48 additions & 7 deletions

File tree

src/dcd_mapping/mavedb_data.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
MAVEDB_BASE_URL,
2828
authentication_header,
2929
http_download,
30+
is_missing_value,
3031
)
3132
from dcd_mapping.schemas import (
3233
ScoreRow,
@@ -246,13 +247,13 @@ def _load_scoreset_records(
246247
with path.open() as csvfile:
247248
reader = csv.DictReader(csvfile)
248249
for row in reader:
249-
if row["score"] == "NA":
250+
if is_missing_value(row["score"]):
250251
row["score"] = None
251252
else:
252253
row["score"] = row["score"]
253-
if row["hgvs_nt"] != "NA":
254+
if not is_missing_value(row["hgvs_nt"]):
254255
prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None
255-
elif row["hgvs_pro"] != "NA":
256+
elif not is_missing_value(row["hgvs_pro"]):
256257
prefix = (
257258
row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None
258259
)

src/dcd_mapping/resource_utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,25 @@
1010

1111
_logger = logging.getLogger(__name__)
1212

13+
# String tokens treated as missing/null data when they appear in CSV fields.
# Upper- and lower-case spellings are listed explicitly, grouped by family.
MISSING_VALUE_REPRESENTATIONS = frozenset(
    (
        # "not available"
        "NA",
        "na",
        "N/A",
        "n/a",
        # "not a number"
        "NaN",
        "nan",
        # null / None literals
        "NULL",
        "null",
        "None",
        "none",
        # empty cell and punctuation placeholders
        "",
        "-",
        ".",
    )
)
31+
1332
MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY")
1433
MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL")
1534
ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO
@@ -24,6 +43,22 @@
2443
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)
2544

2645

46+
def is_missing_value(value: str | None) -> bool:
47+
"""Check if a value represents missing/null data.
48+
49+
This function recognizes multiple common representations of missing data
50+
that may appear in CSV files from external sources, making the codebase
51+
more resilient to upstream changes in NA representation.
52+
53+
:param value: The value to check
54+
:return: True if the value represents missing data, False otherwise
55+
"""
56+
if value is None:
57+
return True
58+
# Strip whitespace and check against known missing value representations
59+
return value.strip() in MISSING_VALUE_REPRESENTATIONS
60+
61+
2762
def authentication_header() -> dict | None:
2863
"""Fetch with api key envvar, if available."""
2964
return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None

src/dcd_mapping/vrs_map.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
get_seqrepo,
3333
translate_hgvs_to_vrs,
3434
)
35-
from dcd_mapping.resource_utils import request_with_backoff
35+
from dcd_mapping.resource_utils import is_missing_value, request_with_backoff
3636
from dcd_mapping.schemas import (
3737
AlignmentResult,
3838
MappedScore,
@@ -378,7 +378,11 @@ def _map_protein_coding_pro(
378378
:param transcript: The transcript selection information for a score set
379379
:return: VRS mapping object if mapping succeeds
380380
"""
381-
if row.hgvs_pro in {"_wt", "_sy", "NA"} or len(row.hgvs_pro) == 3:
381+
if (
382+
row.hgvs_pro in {"_wt", "_sy"}
383+
or is_missing_value(row.hgvs_pro)
384+
or len(row.hgvs_pro) == 3
385+
):
382386
_logger.warning(
383387
"Can't process variant syntax %s for %s", row.hgvs_pro, row.accession
384388
)
@@ -700,7 +704,7 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool:
700704
:return: True if expression appears populated and valid
701705
"""
702706
return (
703-
(hgvs_nt != "NA")
707+
(not is_missing_value(hgvs_nt))
704708
and (hgvs_nt not in {"_wt", "_sy", "="})
705709
and (len(hgvs_nt) != 3)
706710
)
@@ -713,7 +717,8 @@ def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
713717
:return: True if expression appears populated and valid
714718
"""
715719
return (
716-
(hgvs_pro not in {"_wt", "_sy", "NA"})
720+
(hgvs_pro not in {"_wt", "_sy"})
721+
and (not is_missing_value(hgvs_pro))
717722
and (len(hgvs_pro) != 3)
718723
and ("fs" not in hgvs_pro)
719724
)

0 commit comments

Comments
 (0)