Skip to content

Commit 441eb04

Browse files
authored
Merge pull request #29 from VACLab/work-with-real-data
minor changes to make the tool work with real data
2 parents 6104097 + 4a296b4 commit 441eb04

4 files changed

Lines changed: 31 additions & 24 deletions

File tree

biasanalyzer/cohort.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from biasanalyzer.concept import ConceptHierarchy
1111
from biasanalyzer.config import load_cohort_creation_config
1212
from biasanalyzer.database import BiasDatabase, OMOPCDMDatabase
13-
from biasanalyzer.models import CohortDefinition
13+
from biasanalyzer.models import CohortDefinition, DOMAIN_MAPPING
1414
from biasanalyzer.utils import clean_string, hellinger_distance, notify_users
1515

1616

@@ -59,6 +59,9 @@ def get_concept_stats(
5959
"""
6060
Get cohort concept statistics such as concept prevalence
6161
"""
62+
if concept_type not in DOMAIN_MAPPING:
63+
raise ValueError(f'input concept_type {concept_type} is not a valid concept type to get concept stats')
64+
6265
cohort_stats = self.bias_db.get_cohort_concept_stats(
6366
self.cohort_id,
6467
self.query_builder,

biasanalyzer/database.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def _create_cohort_definition_table(self):
9999
def _create_cohort_table(self):
100100
self.conn.execute(f"""
101101
CREATE TABLE IF NOT EXISTS {self.schema}.cohort (
102-
subject_id BIGINT,
102+
subject_id VARCHAR NOT NULL,
103103
cohort_definition_id INTEGER,
104104
cohort_start_date DATE,
105105
cohort_end_date DATE,
@@ -288,12 +288,14 @@ def get_cohort_concept_stats(
288288
)
289289
concept_stats[concept_type] = self._execute_query(query)
290290
cs_df = pd.DataFrame(concept_stats[concept_type])
291-
# Combine concept_name and prevalence into a "details" column
292-
cs_df["details"] = cs_df.apply(
293-
lambda row: f"{row['concept_name']} (Code: {row['concept_code']}, "
294-
f"Count: {row['count_in_cohort']}, Prevalence: {row['prevalence']:.3%})",
295-
axis=1,
296-
)
291+
292+
if not cs_df.empty:
293+
# Combine concept_name and prevalence into a "details" column
294+
cs_df["details"] = cs_df.apply(
295+
lambda row: f"{row['concept_name']} (Code: {row['concept_code']}, "
296+
f"Count: {row['count_in_cohort']}, Prevalence: {row['prevalence']:.3%})",
297+
axis=1,
298+
)
297299

298300
if print_concept_hierarchy:
299301
filtered_cs_df = cs_df[cs_df["ancestor_concept_id"] != cs_df["descendant_concept_id"]]

tests/query_based/test_cohort_creation.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,10 @@ def test_cohort_creation_baseline(caplog, test_db):
8686

8787
patient_ids = set([item["subject_id"] for item in cohort.data])
8888
assert_equal(len(patient_ids), 5)
89-
assert_equal(patient_ids, {106, 108, 110, 111, 112})
89+
assert_equal(patient_ids, {'106', '108', '110', '111', '112'})
9090
# select two patients to check that cohort_start_date and cohort_end_date are computed automatically
91-
patient_106 = next(item for item in cohort.data if item["subject_id"] == 106)
92-
patient_108 = next(item for item in cohort.data if item["subject_id"] == 108)
91+
patient_106 = next(item for item in cohort.data if item["subject_id"] == '106')
92+
patient_108 = next(item for item in cohort.data if item["subject_id"] == '108')
9393

9494
# Replace dates with actual values from your test data
9595
assert_equal(
@@ -127,7 +127,7 @@ def test_cohort_creation_study(test_db):
127127
assert cohort.data is not None, "Cohort creation wrongly returned None data"
128128
patient_ids = set([item["subject_id"] for item in cohort.data])
129129
assert_equal(len(patient_ids), 4)
130-
assert_equal(patient_ids, {108, 110, 111, 112})
130+
assert_equal(patient_ids, {'108', '110', '111', '112'})
131131

132132

133133
def test_cohort_creation_study2(caplog, test_db):
@@ -155,7 +155,7 @@ def test_cohort_creation_study2(caplog, test_db):
155155
assert cohort.data is not None, "Cohort creation wrongly returned None data"
156156
patient_ids = set([item["subject_id"] for item in cohort.data])
157157
assert_equal(len(patient_ids), 1)
158-
assert_equal(patient_ids, {106})
158+
assert_equal(patient_ids, {'106'})
159159

160160

161161
def test_cohort_creation_all(caplog, test_db):
@@ -191,7 +191,7 @@ def test_cohort_creation_all(caplog, test_db):
191191
patient_ids = set([item["subject_id"] for item in cohort.data])
192192
print(f"patient_ids: {patient_ids}", flush=True)
193193
assert_equal(len(patient_ids), 2)
194-
assert_equal(patient_ids, {108, 110})
194+
assert_equal(patient_ids, {'108', '110'})
195195

196196

197197
def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db):
@@ -214,7 +214,7 @@ def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db):
214214
patient_ids = set([item["subject_id"] for item in cohort.data])
215215
print(f"patient_ids: {patient_ids}", flush=True)
216216
assert_equal(len(patient_ids), 2)
217-
assert_equal(patient_ids, {108, 110})
217+
assert_equal(patient_ids, {'108', '110'})
218218

219219

220220
def test_cohort_creation_mixed_domains(test_db):
@@ -242,7 +242,7 @@ def test_cohort_creation_mixed_domains(test_db):
242242
patient_ids = set([item["subject_id"] for item in cohort.data])
243243
print(f"patient_ids: {patient_ids}", flush=True)
244244
assert_equal(len(patient_ids), 3)
245-
assert_equal(patient_ids, {1, 2, 6})
245+
assert_equal(patient_ids, {'1', '2', '6'})
246246
start_dates = [item["cohort_start_date"] for item in cohort.data]
247247
assert_equal(len(start_dates), 3)
248248
assert_equal(start_dates, [datetime.date(2020, 6, 1), datetime.date(2020, 6, 1), datetime.date(2018, 1, 1)])
@@ -356,10 +356,10 @@ def test_cohort_creation_negative_instance(test_db):
356356

357357
patient_ids = set([item["subject_id"] for item in cohort.data])
358358
assert_equal(len(patient_ids), 6) # patients 1, 2, 3, 5, 6, 7 (earlier note listed only female patients 1, 2, 3, 5 -- TODO confirm 6 and 7)
359-
assert_equal(patient_ids, {1, 2, 3, 5, 6, 7})
359+
assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'})
360360

361361
# Verify dates for a specific patient (e.g., patient 1 with last diabetes diagnosis)
362-
patient_1 = next(item for item in cohort.data if item["subject_id"] == 1)
362+
patient_1 = next(item for item in cohort.data if item["subject_id"] == '1')
363363
assert_equal(
364364
patient_1["cohort_start_date"],
365365
datetime.date(2020, 6, 1),
@@ -392,10 +392,10 @@ def test_cohort_creation_offset(test_db):
392392

393393
patient_ids = set([item["subject_id"] for item in cohort.data])
394394
assert_equal(len(patient_ids), 6) # patients 1, 2, 3, 5, 6, 7 (earlier note listed only female patients 1, 2, 3, 5 -- TODO confirm 6 and 7)
395-
assert_equal(patient_ids, {1, 2, 3, 5, 6, 7})
395+
assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'})
396396

397397
# Verify dates for a specific patient (e.g., patient 1 with offset)
398-
patient_1 = next(item for item in cohort.data if item["subject_id"] == 1)
398+
patient_1 = next(item for item in cohort.data if item["subject_id"] == '1')
399399
# Diabetes on 2020-06-01: -730 days = 2018-06-02, +180 days = 2020-11-28
400400
assert_equal(
401401
patient_1["cohort_start_date"],
@@ -435,10 +435,10 @@ def test_cohort_creation_negative_instance_offset(test_db):
435435

436436
patient_ids = set([item["subject_id"] for item in cohort.data])
437437
assert_equal(len(patient_ids), 6)
438-
assert_equal(patient_ids, {1, 2, 3, 5, 6, 7})
438+
assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'})
439439

440440
# Verify dates for a specific patient (e.g., patient 1 with last diabetes and offset)
441-
patient_1 = next(item for item in cohort.data if item["subject_id"] == 1)
441+
patient_1 = next(item for item in cohort.data if item["subject_id"] == '1')
442442
# Last diabetes on 2020-06-01: +180 days = 2020-11-28
443443
assert_equal(
444444
patient_1["cohort_start_date"],

tests/query_based/test_hierarchical_prevalence.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import pytest
2+
from numpy.ma.testutils import assert_equal
3+
24
from biasanalyzer.concept import ConceptHierarchy
35

46

@@ -25,8 +27,8 @@ def test_cohort_concept_hierarchical_prevalence(test_db, caplog):
2527
cohort.get_concept_stats(vocab="dummy_invalid_vocab")
2628

2729
# test the cohort does not have procedure_occurrence related concepts
28-
with pytest.raises(ValueError):
29-
cohort.get_concept_stats(concept_type="procedure_occurrence")
30+
cohort_stat, _ = cohort.get_concept_stats(concept_type="procedure_occurrence")
31+
assert_equal(cohort_stat, {'procedure_occurrence': []})
3032

3133
concept_stats, _ = cohort.get_concept_stats(vocab="ICD10CM", print_concept_hierarchy=True)
3234
assert concept_stats is not None, "Failed to fetch concept stats"

0 commit comments

Comments
 (0)