Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion R/clean_lab_main.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u
meta_unit_conv <- data.table::fread(lab_unit_conversion)
meta_thresholds <- data.table::fread(lab_thresholds)

requested_datasource <- datasource
if (requested_datasource != "" && "datasource" %in% names(meta_unit_conv)) {
meta_unit_conv <- meta_unit_conv[
is.na(datasource) | trimws(as.character(datasource)) == "" | datasource == requested_datasource
]
}

# If list_analyses is empty, use all concept_ids from target units
if (length(list_analyses) == 0) {
list_analyses <- unique(meta_target_units$concept_id)
Expand Down Expand Up @@ -97,7 +104,10 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u
# For compatibility with mo_convert, rename unit_origin to unit_matched
setnames(meta_cid_full, "unit_origin", "unit_matched", skip_absent = TRUE)
# Step 4: mo_convert with full metadata (unit conversion and thresholding)
dt_cid <- mo_convert(dt_cid, meta_cid_full)
dt_cid <- mo_convert(
dat_unit_matched = dt_cid,
metadata_convert = meta_cid_full
)
dt_cid[, value := data.table::fifelse(included == 1, value_converted, NA_real_)]
dt_cid[, unit_target := target_unit_cid]

Expand Down
31 changes: 17 additions & 14 deletions R/mo_convert.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
list(success = FALSE, tried = 1L)
}

.mo_try_missing_chain <- function(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE) {
.mo_try_missing_chain <- function(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) {
# Returns list(success=TRUE/FALSE, tried=number_of_attempts_tried)
tried <- 0L
# explicit next_attempt chain (>0)
Expand Down Expand Up @@ -170,6 +170,13 @@
}
# rows labelled as missing
missing_rows <- attempts[unit_matched == "missing"]
if (!is.na(skip_assumed_unit) && nrow(missing_rows) > 0 && "assumed_unit_if_missing" %in% names(missing_rows)) {
missing_rows <- missing_rows[
is.na(next_attempt) |
next_attempt != 0 |
.mo_norm(assumed_unit_if_missing) != skip_assumed_unit
]
}
if (nrow(missing_rows) > 0) {
for (r in seq_len(nrow(missing_rows))) {
attempt_row <- missing_rows[r]
Expand Down Expand Up @@ -273,42 +280,38 @@ mo_convert <- function(dat_unit_matched, metadata_convert) {
missing_attempts_tried <- 0L
if (origin_other) {
row_unit_matched <- NA_character_
pref_res <- list(success = FALSE, tried = 0L)
if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i]))
if (!is.na(row_unit_matched) && nrow(attempts) > 0) {
pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE)
missing_attempts_tried <- missing_attempts_tried + pref_res$tried
if (isTRUE(pref_res$success)) next
}
miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 1L, other_flow = TRUE)
skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_
miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit)
missing_attempts_tried <- missing_attempts_tried + miss_res$tried
if (isTRUE(miss_res$success)) next
} else {
if (unit_missing_flag) {
row_unit_matched <- NA_character_
if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i]))
pref_res <- list(success = FALSE, tried = 0L)
if (!is.na(row_unit_matched) && nrow(attempts) > 0) {
pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE)
missing_attempts_tried <- missing_attempts_tried + pref_res$tried
if (isTRUE(pref_res$success)) next
}
if (nrow(attempts) > 0) {
miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE)
# Only call the missing chain if prefill did not already cover the single attempt
# (i.e., prefill did not run, or there are chained attempts with next_attempt > 0 to pursue).
if (nrow(attempts) > 0 && (pref_res$tried == 0L || nrow(attempts[!is.na(next_attempt) & next_attempt > 0L]) > 0L)) {
skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_
miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit)
missing_attempts_tried <- missing_attempts_tried + miss_res$tried
if (isTRUE(miss_res$success)) next
}
}
}

# 2) If unit missing (or flagged missing), try attempts ordered by next_attempt (for assumed units)
if (unit_missing_flag && nrow(attempts) > 0) {
# consider explicit next_attempt chain (>0) first, then any rows that encode MISSING regardless of next_attempt
# Try the missing-attempts chain (includes explicit next_attempt chain
# and any rows labelled as MISSING).
miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE)
missing_attempts_tried <- missing_attempts_tried + miss_res$tried
if (isTRUE(miss_res$success)) next
}

# 3) Fallback: try all attempts ordered by next_attempt (1,2,...) as fallbacks for conversion
conv_success_current <- if (origin_other) 1L else if (unit_missing_flag) 3L else 1L
other_flow_current <- origin_other
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
person_id,concept_id,value_origin,unit_origin,included,value,unit_target,conversion,rule_applied
P01,LAB_HBA1C,6,%,1,42.08,mmol/mol,1,1
P02,LAB_HBA1C,42,mmol/mol,1,42,mmol/mol,0,0
P03,LAB_HBA1C,70,%,1,70,mmol/mol,1,1
P04,LAB_HBA1C,70,"",1,70,mmol/mol,3,0
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
person_id,concept_id,value_origin,unit_origin,included,value,unit_target,conversion,rule_applied
P01,LAB_HBA1C,6.0,%,1,42.08,mmol/mol,1,1
P02,LAB_HBA1C,42,mmol/mol,1,42,mmol/mol,0,0
P03,LAB_HBA1C,70,%,0,,mmol/mol,1,90
P04,LAB_HBA1C,70,,0,,mmol/mol,3,91
2 changes: 2 additions & 0 deletions tests/data/Example 5/i_input/LAB_target_units.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
concept_id,unit_target
LAB_HBA1C,mmol/mol
2 changes: 2 additions & 0 deletions tests/data/Example 5/i_input/LAB_threshold.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
concept_id;Min;Max;unit_target;condition_on_variable;variable
LAB_HBA1C;9;174;mmol/mol;;
5 changes: 5 additions & 0 deletions tests/data/Example 5/i_input/LAB_unit_conversion.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
concept_id,datasource,unit_origin,unit_target,multiplication_factor_from_origin_to_target,conversion_not_multiplication,conversion_rate,condition_on_value,assumed_unit_if_missing,next_attempt
LAB_HBA1C,,%,"mmol/mol",,(10.93 * value) - 23.5,,,,0
LAB_HBA1C,DS_A,MISSING,"mmol/mol",,(10.93 * value) - 23.5,,,"%",0
LAB_HBA1C,DS_B,MISSING,"mmol/mol",1,,1,,"mmol/mol",1
LAB_HBA1C,DS_B,MISSING,"mmol/mol",,(10.93 * value) - 23.5,,,%,2
3 changes: 3 additions & 0 deletions tests/data/Example 5/i_input/LAB_unit_conversion_no_DS.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
concept_id,datasource,unit_origin,unit_target,multiplication_factor_from_origin_to_target,conversion_not_multiplication,conversion_rate,condition_on_value,assumed_unit_if_missing,next_attempt
LAB_HBA1C,,%,"mmol/mol",,(10.93 * value) - 23.5,,,,0
LAB_HBA1C,DS_A,MISSING,"mmol/mol",,(10.93 * value) - 23.5,,,"%",0
5 changes: 5 additions & 0 deletions tests/data/Example 5/i_input/dataset_lab_values.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
person_id,concept_id,value,unit
P01,LAB_HBA1C,6.0,%
P02,LAB_HBA1C,42,mmol/mol
P03,LAB_HBA1C,70,%
P04,LAB_HBA1C,70,
17 changes: 12 additions & 5 deletions tests/test_clean_lab_temp.R → tests/test_specific_example.R
Original file line number Diff line number Diff line change
@@ -1,32 +1,39 @@
# Test script for clean_lab_main
# This script will run clean_lab_main on all Example datasets and compare output to ground truth
# Test script for debugging
# This script will run CleanLabValuesDataset on an Example dataset, save the output and compare output to ground truth, if any

library(data.table)
# Load modular code
source("R/load_dependencies.R")
load_cleanlab()

examples <- c("Example 4")
examples <- c("Example 5")
base_path <- "tests/data"

for (ex in examples) {
cat("\nRunning test for", ex, "...\n")
input_dir <- file.path(base_path, ex, "i_input")
gt_dir <- file.path(base_path, ex, "i_ground_truth")

out_dir <- file.path(base_path, ex, "g_output")
dir.create(out_dir, showWarnings = F)

dataset_lab_values <- fread(file.path(input_dir, "dataset_lab_values.csv"))
path_lab_target_units <- file.path(input_dir, "LAB_target_units.csv")
path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion_wrong.csv")
path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion.csv")
# path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion_no_DS.csv")
path_lab_thresholds <- file.path(input_dir, "LAB_threshold.csv")

# Run cleaning
cleaned <- CleanLabValuesDataset(
dataset = dataset_lab_values,
datasource = "DS_A",
lab_target_units = path_lab_target_units,
lab_unit_conversion = path_unit_conversion,
lab_thresholds = path_lab_thresholds
)

# Save output
fwrite(cleaned, file.path(out_dir,"dataset_cleaned_output_lab_values.csv"))

# Load ground truth
gt_file <- file.path(gt_dir, "dataset_cleaned_lab_values.csv")
if (file.exists(gt_file)) {
Expand Down