diff --git a/R/clean_lab_main.R b/R/clean_lab_main.R index 8489980..21a5451 100644 --- a/R/clean_lab_main.R +++ b/R/clean_lab_main.R @@ -22,6 +22,13 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u meta_unit_conv <- data.table::fread(lab_unit_conversion) meta_thresholds <- data.table::fread(lab_thresholds) + requested_datasource <- datasource + if (requested_datasource != "" && "datasource" %in% names(meta_unit_conv)) { + meta_unit_conv <- meta_unit_conv[ + is.na(datasource) | trimws(as.character(datasource)) == "" | datasource == requested_datasource + ] + } + # If list_analyses is empty, use all concept_ids from target units if (length(list_analyses) == 0) { list_analyses <- unique(meta_target_units$concept_id) @@ -97,7 +104,10 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # For compatibility with mo_convert, rename unit_origin to unit_matched setnames(meta_cid_full, "unit_origin", "unit_matched", skip_absent = TRUE) # Step 4: mo_convert with full metadata (unit conversion and thresholding) - dt_cid <- mo_convert(dt_cid, meta_cid_full) + dt_cid <- mo_convert( + dat_unit_matched = dt_cid, + metadata_convert = meta_cid_full + ) dt_cid[, value := data.table::fifelse(included == 1, value_converted, NA_real_)] dt_cid[, unit_target := target_unit_cid] diff --git a/R/mo_convert.R b/R/mo_convert.R index 1fcbd98..6a22742 100644 --- a/R/mo_convert.R +++ b/R/mo_convert.R @@ -141,7 +141,7 @@ list(success = FALSE, tried = 1L) } -.mo_try_missing_chain <- function(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE) { +.mo_try_missing_chain <- function(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { # Returns list(success=TRUE/FALSE, tried=number_of_attempts_tried) tried <- 0L # explicit next_attempt chain (>0) @@ -170,6 +170,13 @@ } # rows labelled as missing missing_rows <- attempts[unit_matched == "missing"] + if (!is.na(skip_assumed_unit) && nrow(missing_rows) > 0 && "assumed_unit_if_missing" %in% names(missing_rows)) { + missing_rows <- missing_rows[ + is.na(next_attempt) | + next_attempt != 0 | + .mo_norm(assumed_unit_if_missing) != skip_assumed_unit + ] + } if (nrow(missing_rows) > 0) { for (r in seq_len(nrow(missing_rows))) { attempt_row <- missing_rows[r] @@ -273,42 +280,38 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { missing_attempts_tried <- 0L if (origin_other) { row_unit_matched <- NA_character_ + pref_res <- list(success = FALSE, tried = 0L) if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i])) if (!is.na(row_unit_matched) && nrow(attempts) > 0) { pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 1L, other_flow = TRUE) + skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ + miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } else { if (unit_missing_flag) { row_unit_matched <- NA_character_ if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i])) + pref_res <- list(success = FALSE, tried = 0L) if (!is.na(row_unit_matched) && nrow(attempts) > 0) { pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } - if (nrow(attempts) > 0) { - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE) + # Only call the missing chain if prefill did not already cover the single attempt + # (i.e., prefill did not run, or there are chained attempts with next_attempt > 0 to pursue). + if (nrow(attempts) > 0 && (pref_res$tried == 0L || nrow(attempts[!is.na(next_attempt) & next_attempt > 0L]) > 0L)) { + skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ + miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } } } - # 2) If unit missing (or flagged missing), try attempts ordered by next_attempt (for assumed units) - if (unit_missing_flag && nrow(attempts) > 0) { - # consider explicit next_attempt chain (>0) first, then any rows that encode MISSING regardless of next_attempt - # Try the missing-attempts chain (includes explicit next_attempt chain - # and any rows labelled as MISSING). - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE) - missing_attempts_tried <- missing_attempts_tried + miss_res$tried - if (isTRUE(miss_res$success)) next - } - # 3) Fallback: try all attempts ordered by next_attempt (1,2,...) as fallbacks for conversion conv_success_current <- if (origin_other) 1L else if (unit_missing_flag) 3L else 1L other_flow_current <- origin_other diff --git a/tests/data/Example 5/g_output/dataset_cleaned_output_lab_values.csv b/tests/data/Example 5/g_output/dataset_cleaned_output_lab_values.csv new file mode 100644 index 0000000..601b1a5 --- /dev/null +++ b/tests/data/Example 5/g_output/dataset_cleaned_output_lab_values.csv @@ -0,0 +1,5 @@ +person_id,concept_id,value_origin,unit_origin,included,value,unit_target,conversion,rule_applied +P01,LAB_HBA1C,6,%,1,42.08,mmol/mol,1,1 +P02,LAB_HBA1C,42,mmol/mol,1,42,mmol/mol,0,0 +P03,LAB_HBA1C,70,%,1,70,mmol/mol,1,1 +P04,LAB_HBA1C,70,"",1,70,mmol/mol,3,0 diff --git a/tests/data/Example 5/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/data/Example 5/i_ground_truth/dataset_cleaned_lab_values.csv new file mode 100644 index 0000000..78ca7d6 --- /dev/null +++ b/tests/data/Example 5/i_ground_truth/dataset_cleaned_lab_values.csv @@ -0,0 +1,5 @@ +person_id,concept_id,value_origin,unit_origin,included,value,unit_target,conversion,rule_applied +P01,LAB_HBA1C,6.0,%,1,42.08,mmol/mol,1,1 +P02,LAB_HBA1C,42,mmol/mol,1,42,mmol/mol,0,0 +P03,LAB_HBA1C,70,%,0,,mmol/mol,1,90 +P04,LAB_HBA1C,70,,0,,mmol/mol,3,91 diff --git a/tests/data/Example 5/i_input/LAB_target_units.csv b/tests/data/Example 5/i_input/LAB_target_units.csv new file mode 100644 index 0000000..3b55a55 --- /dev/null +++ b/tests/data/Example 5/i_input/LAB_target_units.csv @@ -0,0 +1,2 @@ +concept_id,unit_target +LAB_HBA1C,mmol/mol diff --git a/tests/data/Example 5/i_input/LAB_threshold.csv b/tests/data/Example 5/i_input/LAB_threshold.csv new file mode 100644 index 0000000..efe68bd --- /dev/null +++ b/tests/data/Example 5/i_input/LAB_threshold.csv @@ -0,0 +1,2 @@ +concept_id;Min;Max;unit_target;condition_on_variable;variable +LAB_HBA1C;9;174;mmol/mol;; diff --git a/tests/data/Example 5/i_input/LAB_unit_conversion.csv b/tests/data/Example 5/i_input/LAB_unit_conversion.csv new file mode 100644 index 0000000..2075e95 --- /dev/null +++ b/tests/data/Example 5/i_input/LAB_unit_conversion.csv @@ -0,0 +1,5 @@ +concept_id,datasource,unit_origin,unit_target,multiplication_factor_from_origin_to_target,conversion_not_multiplication,conversion_rate,condition_on_value,assumed_unit_if_missing,next_attempt +LAB_HBA1C,,%,"mmol/mol",,(10.93 * value) - 23.5,,,,0 +LAB_HBA1C,DS_A,MISSING,"mmol/mol",,(10.93 * value) - 23.5,,,"%",0 +LAB_HBA1C,DS_B,MISSING,"mmol/mol",1,,1,,"mmol/mol",1 +LAB_HBA1C,DS_B,MISSING,"mmol/mol",,(10.93 * value) - 23.5,,,%,2 diff --git a/tests/data/Example 5/i_input/LAB_unit_conversion_no_DS.csv b/tests/data/Example 5/i_input/LAB_unit_conversion_no_DS.csv new file mode 100644 index 0000000..cb5fe33 --- /dev/null +++ b/tests/data/Example 5/i_input/LAB_unit_conversion_no_DS.csv @@ -0,0 +1,3 @@ +concept_id,datasource,unit_origin,unit_target,multiplication_factor_from_origin_to_target,conversion_not_multiplication,conversion_rate,condition_on_value,assumed_unit_if_missing,next_attempt +LAB_HBA1C,,%,"mmol/mol",,(10.93 * value) - 23.5,,,,0 +LAB_HBA1C,DS_A,MISSING,"mmol/mol",,(10.93 * value) - 23.5,,,"%",0 diff --git a/tests/data/Example 5/i_input/dataset_lab_values.csv b/tests/data/Example 5/i_input/dataset_lab_values.csv new file mode 100644 index 0000000..93c2961 --- /dev/null +++ b/tests/data/Example 5/i_input/dataset_lab_values.csv @@ -0,0 +1,5 @@ +person_id,concept_id,value,unit +P01,LAB_HBA1C,6.0,% +P02,LAB_HBA1C,42,mmol/mol +P03,LAB_HBA1C,70,% +P04,LAB_HBA1C,70, diff --git a/tests/test_clean_lab_temp.R b/tests/test_specific_example.R similarity index 74% rename from tests/test_clean_lab_temp.R rename to tests/test_specific_example.R index 295a26d..cf7fd78 100644 --- a/tests/test_clean_lab_temp.R +++ b/tests/test_specific_example.R @@ -1,32 +1,39 @@ -# Test script for clean_lab_main -# This script will run clean_lab_main on all Example datasets and compare output to ground truth +# Test script for debugging +# This script will run CleanLabValuesDataset on an Example dataset, save the output and compare output to ground truth, if any library(data.table) # Load modular code source("R/load_dependencies.R") load_cleanlab() -examples <- c("Example 4") +examples <- c("Example 5") base_path <- "tests/data" for (ex in examples) { cat("\nRunning test for", ex, "...\n") input_dir <- file.path(base_path, ex, "i_input") gt_dir <- file.path(base_path, ex, "i_ground_truth") - + out_dir <- file.path(base_path, ex, "g_output") + dir.create(out_dir, showWarnings = F) + dataset_lab_values <- fread(file.path(input_dir, "dataset_lab_values.csv")) path_lab_target_units <- file.path(input_dir, "LAB_target_units.csv") - path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion_wrong.csv") + path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion.csv") +# path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion_no_DS.csv") path_lab_thresholds <- file.path(input_dir, "LAB_threshold.csv") # Run cleaning cleaned <- CleanLabValuesDataset( dataset = dataset_lab_values, + datasource = "DS_A", lab_target_units = path_lab_target_units, lab_unit_conversion = path_unit_conversion, lab_thresholds = path_lab_thresholds ) + # Save output + fwrite(cleaned, file.path(out_dir,"dataset_cleaned_output_lab_values.csv")) + # Load ground truth gt_file <- file.path(gt_dir, "dataset_cleaned_lab_values.csv") if (file.exists(gt_file)) {