diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..ae0663c Binary files /dev/null and b/.DS_Store differ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..d1fdbf7 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,18 @@ +Package: CleanLabValues +Title: Clean and Convert Laboratory Values +Version: 0.2.0 +Authors@R: + c(person("Rosa", "Gini", role = "aut"), + person("Yinan", "Mao", role = c("aut", "cre"), email = "ymao@example.com")) +Description: Ingests instructions to clean datasets containing results from + laboratory analyses. Handles unit conversions, missing-unit imputation, and + exclusion of physiologically implausible values based on configurable + metadata (target units, conversion rules, and thresholds). +License: AGPL-3 +Encoding: UTF-8 +RoxygenNote: 7.3.3 +Imports: + data.table +Suggests: + testthat (>= 3.0.0) +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..a2cef55 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,3 @@ +# Generated by roxygen2: do not edit by hand + +export(CleanLabValuesDataset) diff --git a/R/CleanLabValuesDataset.R b/R/CleanLabValuesDataset.R index b24eca8..0848ca2 100644 --- a/R/CleanLabValuesDataset.R +++ b/R/CleanLabValuesDataset.R @@ -1,16 +1,3 @@ -# Authors: Rosa Gini, Yinan Mao - -# 16 Apr 2026 - -# Version 0.2 -# Set up checks for the arguments -# set up Examples 1, ..., 3 - -# 15 Apr 2026 - -# Version 0.1 -# Set up function - #' CleanLabValuesDataset #' #' The function CleanLabValuesDataset ingests instructions to clean datasets containing results from laboratory analysis. The instructions specify which unit of measurement is desired for each laboratory analysis, what the conversion rules are, what to do if unit of measurement is missing, what the values should be considered absurd and discarded. @@ -19,9 +6,12 @@ #' @param dataset the name of a data.table file in memory that contains a dataset of results of laboratory analyses that needs cleaning #' @param list_analyses a string vector containing the names of the laboratory analyses to be cleaned. If the argument is not specified, all the laboratory analyses are cleaned #' @param lab_target_units a string containing the path towards a csv files containing one record per each type of laboratory analysis in list_analyses and specifying the desired unit of measurement -#' @param lab_unit_conversions a string containing the path towards a csv files containing the specifications to convert the values in the dataset to the target unit of measurement +#' @param lab_unit_conversion a string containing the path towards a csv files containing the specifications to convert the values in the dataset to the target unit of measurement #' @param lab_thresholds a string containing the path towards a csv files containing the specifications of which values should be considered absurd and discarded, possibly depending on other variables such as age -#' @param datasource (non mandatory) a string containing name of the datasource that can be stored in lab_unit_conversions to produce a datasource-specific assumption on what to do if the unit of measurement is missing +#' @param datasource (non mandatory) a string containing name of the datasource that can be stored in `lab_unit_conversion` to produce a datasource-specific assumption on what to do if the unit of measurement is missing +#' @return A `data.table` with the cleaned lab results and appended result columns. +#' +#' @export #' #' #' @details @@ -35,11 +25,6 @@ # CleanLabValuesDataset <- function(dataset, list_analyses = c(), lab_target_units, lab_unit_conversion, lab_thresholds, datasource = "") { - # Ensure check functions are available; source minimal checks if needed - if (!exists("check_dataset_model")) { - if (file.exists("R/check_metadata.R")) source("R/check_metadata.R") else stop("Missing R/check_metadata.R") - } - # Basic validation of dataset and metadata files if (!is.data.frame(dataset) && !data.table::is.data.table(dataset)) stop("`dataset` must be a data.frame or data.table") for (varname in c("concept_id", "value", "unit")) if (!(varname %in% names(dataset))) stop(paste("dataset must contain column", varname)) diff --git a/R/check_metadata.R b/R/check_metadata.R index b20d66f..9671ff5 100644 --- a/R/check_metadata.R +++ b/R/check_metadata.R @@ -1,5 +1,14 @@ -# Functions for metadata and input checks - +#' Validate a condition string against a data.table +#' +#' Check that `cond` is a single, parseable R expression that only +#' references columns present in `dt` and evaluates to a logical vector of +#' length 1 or `nrow(dt)`. +#' +#' @param dt A `data.frame` or `data.table` containing the variables used in `cond`. +#' @param cond A single string containing an R expression that can be evaluated +#' in the context of `dt` (e.g. "value > 0"). +#' @return `TRUE` if the condition is valid, `FALSE` otherwise. +#' @keywords internal is_valid_dt_condition <- function(dt, cond) { if (!is.character(cond) || length(cond) != 1L || is.na(cond)) { return(FALSE) @@ -26,6 +35,14 @@ is_valid_dt_condition <- function(dt, cond) { TRUE } +#' Check dataset model +#' +#' Ensure `dataset` contains the minimal variables required by the +#' cleaning pipeline (`concept_id`, `value`, `unit`). +#' +#' @param dataset A `data.frame` or `data.table` representing the input dataset. +#' @return Invisibly returns `NULL` on success, otherwise throws an error. +#' @keywords internal check_dataset_model <- function(dataset) { for (varname in c("concept_id", "value", "unit")) { if (!(varname %in% names(dataset))) { @@ -49,6 +66,15 @@ check_dataset_model <- function(dataset) { ####################################################### # lab_target_units +#' Check LAB_target_units file +#' +#' Validate that the `LAB_target_units` CSV exists and contains at least +#' `concept_id` and `unit_target` columns. Returns the parsed data.table +#' invisibly on success. +#' +#' @param lab_target_units Path to the `LAB_target_units` CSV file. +#' @return A `data.table` read from `lab_target_units` (invisibly). +#' @keywords internal check_lab_target_units <- function(lab_target_units) { if (!(file.exists(lab_target_units))) { stop(paste("The file", lab_target_units, "cannot be found")) @@ -65,6 +91,18 @@ check_lab_target_units <- function(lab_target_units) { ####################################################### # lab_unit_conversion +#' Check LAB_unit_conversion file +#' +#' Validate the unit conversion metadata file and basic consistency with +#' `LAB_target_units`. Returns the parsed `data.table` invisibly. +#' +#' @param lab_unit_conversion Path to the `LAB_unit_conversion` CSV file. +#' @param datasource Optional datasource identifier (string) used to check +#' for a `datasource` column when provided. +#' @param list_analyses Character vector of `concept_id` values expected. +#' @param target_unit Named character vector mapping `concept_id` -> `unit_target`. +#' @return A `data.table` read from `lab_unit_conversion` (invisibly). +#' @keywords internal check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_analyses, target_unit) { if (!(file.exists(lab_unit_conversion))) { stop(paste("The file", lab_unit_conversion, "cannot be found")) @@ -255,6 +293,15 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal ##################################### # lab_thresholds +#' Check LAB_thresholds file +#' +#' Validate the `LAB_thresholds` CSV file structure and ensure `Min`/`Max` +#' columns are numeric where required. +#' +#' @param lab_thresholds Path to the `LAB_thresholds` CSV file. +#' @param dataset The input dataset (used to validate numeric variables referenced by thresholds). +#' @return A `data.table` read from `lab_thresholds` (invisibly). +#' @keywords internal check_lab_thresholds <- function(lab_thresholds, dataset) { if (!(file.exists(lab_thresholds))) { stop(paste("The file", lab_thresholds, "cannot be found")) diff --git a/R/clean_lab_main.R b/R/clean_lab_main.R index 8489980..b1c56b7 100644 --- a/R/clean_lab_main.R +++ b/R/clean_lab_main.R @@ -1,11 +1,21 @@ -# Main cleaning function for lab values +##' Main cleaning pipeline implementation +#' +#' Internal implementation of the lab cleaning pipeline. Use +#' `CleanLabValuesDataset()` as the user-facing wrapper. +#' @param dataset A `data.frame` or `data.table` with lab measurements. +#' @param list_analyses Character vector of `concept_id` to process (default: all). +#' @param lab_target_units Path to the `LAB_target_units` CSV file. +#' @param lab_unit_conversion Path to the `LAB_unit_conversion` CSV file. +#' @param lab_thresholds Path to the `LAB_thresholds` CSV file. +#' @param datasource Optional datasource identifier (string). +#' @return A `data.table` containing cleaned rows and appended result columns. +#' - `included`: 1/0 whether the value is kept +#' - `value`: cleaned/converted value when `included == 1`, otherwise NA +#' - `unit_target`: the target unit assigned for the concept +#' - `conversion`: integer code indicating conversion origin/type (0/1/2/3) +#' - `rule_applied`: integer code indicating which rule was applied or failure reason +#' @keywords internal # -# Output columns appended by this pipeline (see README for full semantics): -# - `included`: 1/0 whether the value is kept -# - `value`: cleaned/converted value when `included == 1`, otherwise NA -# - `unit_target`: the target unit assigned for the concept -# - `conversion`: integer code indicating conversion origin/type (0/1/2/3) -# - `rule_applied`: integer code indicating which rule was applied or failure reason clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_unit_conversion, lab_thresholds, datasource = "") { # Ensure input is a data.table and capture original input column order dataset <- data.table::as.data.table(dataset) diff --git a/R/fill_missing_unit.R b/R/fill_missing_unit.R index 845597f..b13cb8c 100644 --- a/R/fill_missing_unit.R +++ b/R/fill_missing_unit.R @@ -1,4 +1,18 @@ -# Fill missing units in the dataset based on metadata +## Fill missing units in the dataset based on metadata +#' Fill missing unit values based on metadata rules +#' +#' Use `meta_unit_conv` and `target_unit` to populate a `unit_filled` +#' column in `dt`. If an assumed unit is specified in the metadata it is +#' used; otherwise the `target_unit` mapping is applied. +#' +#' @param dt A `data.table` with the measurements. Operates by reference and +#' returns the modified `data.table`. +#' @param meta_unit_conv A `data.table` providing conversion/assumption rules. +#' @param target_unit A named character vector mapping `concept_id` to `unit_target`. +#' @param concept_id_col Name of the concept id column in `dt` (default: "concept_id"). +#' @param unit_col Name of the unit column in `dt` (default: "unit"). +#' @return The input `data.table` with a new `unit_filled` column. +#' @keywords internal fill_missing_unit <- function(dt, meta_unit_conv, target_unit, concept_id_col = "concept_id", unit_col = "unit") { dt[, unit_filled := get(unit_col)] for (cid in unique(dt[[concept_id_col]])) { diff --git a/R/load_dependencies.R b/R/load_dependencies.R index 1f4c06e..8f322e9 100644 --- a/R/load_dependencies.R +++ b/R/load_dependencies.R @@ -1,26 +1,10 @@ -# Load all CleanLabValues module scripts in a stable order -load_cleanlab <- function() { - srcs <- c( - "R/check_metadata.R", - "R/fill_missing_unit.R", - "R/mo_convert.R", - "R/clean_lab_main.R", - "R/CleanLabValuesDataset.R" - ) - # Try sourcing files relative to a few likely project roots so tests - # running from `tests/testthat` still find the R/ scripts. - roots <- c(".", "..", "..", file.path("..", "..")) - for (s in srcs) { - found <- FALSE - for (r in roots) { - p <- file.path(r, s) - if (file.exists(p)) { - source(p) - found <- TRUE - break - } - } - if (!found) stop(paste("Missing required file:", s)) - } - invisible(TRUE) -} +##' Load dependencies (deprecated for package) +##' +##' This file historically contained `load_cleanlab()` which sourced the +##' module scripts for interactive/testing workflows. In the installed +##' package this manual sourcing is unnecessary because functions are +##' exported via the `NAMESPACE` and loaded automatically. Keep this file +##' as a documentation placeholder only. +##' +##' @keywords internal +NULL diff --git a/R/mo_convert.R b/R/mo_convert.R index 1fcbd98..d922bbc 100644 --- a/R/mo_convert.R +++ b/R/mo_convert.R @@ -1,31 +1,12 @@ -# Convert lab values to target units and flag inclusion, using data.table -# -# Output conventions (matching README): -# - `conversion`: type of conversion from origin value to final value: -# - `0`: no conversion; -# - `1`: conversion from non-missing unit (conversion applied using origin unit rules); -# - `2`: conversion from `OTHER` unit. `OTHER` means the input `unit_origin` is -# a non-empty string but not listed among conversion rules. `OTHER` rows are -# processed through the same attempt/next_attempt chain as `MISSING` rows -# (prefilled assumed unit -> explicit next_attempt chain -> missing rows -> fallbacks), -# but successful results from the `OTHER` flow are marked `conversion = 2`. -# - `3`: conversion from `MISSING` unit (unit was empty/NA and missing-unit rules applied). -# -# - `rule_applied`: evaluation of the conversion: -# - `0`: no conversion needed and result accepted; -# - `1`: conversion needed and result accepted (single attempt accepted or direct match); -# - `2`: more than one conversion needed before acceptance (fallbacks); -# - `90`: no conversion possible and result discarded (no attempts applicable); -# - `91`: one conversion attempted before discarding the result; -# - `92`: more than one conversion attempted before discarding the result; -# - `99`: discarded because value is non-numeric. -# -# Notes: -# - Attempt counting (`n_conversion_attempts`) is used internally to compute 90/91/92. -# - The `OTHER` flow intentionally mirrors `MISSING` in attempt ordering and threshold -# decisioning; only the final `conversion` code differs (2 vs 3). - -## Internal helpers for mo_convert (not exported) +#' Unit conversion helpers +#' +#' Internal helpers and the `mo_convert()` implementation used to convert +#' lab values to target units and compute inclusion/conversion codes. +#' +#' These functions are internal to the package and are not exported. +#' +#' @keywords internal +NULL .mo_norm <- function(x) tolower(trimws(as.character(x))) .mo_get_var_value <- function(dat, row_idx, varname) { @@ -221,7 +202,17 @@ FALSE } -# Refactored mo_convert to incrementally try next_attempt (1, 2, ...) for each concept_id/unit_target, discarding if max next_attempt is reached +##' Convert values using conversion metadata +#' +#' Apply conversion rules described in `metadata_convert` to the rows of +#' `dat_unit_matched`. This returns a copy of the input with `included`, +#' `value_converted`, `conversion`, and `rule_applied` columns populated. +#' +#' @param dat_unit_matched A `data.table` containing rows to convert. Must +#' include `concept_id`, `value`, `unit_origin`, and `unit_target`. +#' @param metadata_convert A `data.table` specifying conversion attempts and thresholds. +#' @return A `data.table` with conversion result columns added. +#' @keywords internal mo_convert <- function(dat_unit_matched, metadata_convert) { # Clean, deterministic implementation of the conversion logic described in README. # For each row: try direct unit matches, then follow next_attempt order for missing units diff --git a/man/CleanLabValuesDataset.Rd b/man/CleanLabValuesDataset.Rd new file mode 100644 index 0000000..0b1bd73 --- /dev/null +++ b/man/CleanLabValuesDataset.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CleanLabValuesDataset.R +\name{CleanLabValuesDataset} +\alias{CleanLabValuesDataset} +\title{CleanLabValuesDataset} +\usage{ +CleanLabValuesDataset( + dataset, + list_analyses = c(), + lab_target_units, + lab_unit_conversion, + lab_thresholds, + datasource = "" +) +} +\arguments{ +\item{dataset}{the name of a data.table file in memory that contains a dataset of results of laboratory analyses that needs cleaning} + +\item{list_analyses}{a string vector containing the names of the laboratory analyses to be cleaned. If the argument is not specified, all the laboratory analyses are cleaned} + +\item{lab_target_units}{a string containing the path towards a csv files containing one record per each type of laboratory analysis in list_analyses and specifying the desired unit of measurement} + +\item{lab_unit_conversion}{a string containing the path towards a csv files containing the specifications to convert the values in the dataset to the target unit of measurement} + +\item{lab_thresholds}{a string containing the path towards a csv files containing the specifications of which values should be considered absurd and discarded, possibly depending on other variables such as age} + +\item{datasource}{(non mandatory) a string containing name of the datasource that can be stored in `lab_unit_conversion` to produce a datasource-specific assumption on what to do if the unit of measurement is missing} +} +\value{ +A `data.table` with the cleaned lab results and appended result columns. +} +\description{ +The function CleanLabValuesDataset ingests instructions to clean datasets containing results from laboratory analysis. The instructions specify which unit of measurement is desired for each laboratory analysis, what the conversion rules are, what to do if unit of measurement is missing, what the values should be considered absurd and discarded. +} +\details{ +... +} +\seealso{ +... +} diff --git a/man/check_dataset_model.Rd b/man/check_dataset_model.Rd new file mode 100644 index 0000000..482ae2c --- /dev/null +++ b/man/check_dataset_model.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_metadata.R +\name{check_dataset_model} +\alias{check_dataset_model} +\title{Check dataset model} +\usage{ +check_dataset_model(dataset) +} +\arguments{ +\item{dataset}{A `data.frame` or `data.table` representing the input dataset.} +} +\value{ +Invisibly returns `NULL` on success, otherwise throws an error. +} +\description{ +Ensure `dataset` contains the minimal variables required by the +cleaning pipeline (`concept_id`, `value`, `unit`). +} +\keyword{internal} diff --git a/man/check_lab_target_units.Rd b/man/check_lab_target_units.Rd new file mode 100644 index 0000000..afae03d --- /dev/null +++ b/man/check_lab_target_units.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_metadata.R +\name{check_lab_target_units} +\alias{check_lab_target_units} +\title{Check LAB_target_units file} +\usage{ +check_lab_target_units(lab_target_units) +} +\arguments{ +\item{lab_target_units}{Path to the `LAB_target_units` CSV file.} +} +\value{ +A `data.table` read from `lab_target_units` (invisibly). +} +\description{ +Validate that the `LAB_target_units` CSV exists and contains at least +`concept_id` and `unit_target` columns. Returns the parsed data.table +invisibly on success. +} +\keyword{internal} diff --git a/man/check_lab_thresholds.Rd b/man/check_lab_thresholds.Rd new file mode 100644 index 0000000..278c60c --- /dev/null +++ b/man/check_lab_thresholds.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_metadata.R +\name{check_lab_thresholds} +\alias{check_lab_thresholds} +\title{Check LAB_thresholds file} +\usage{ +check_lab_thresholds(lab_thresholds, dataset) +} +\arguments{ +\item{lab_thresholds}{Path to the `LAB_thresholds` CSV file.} + +\item{dataset}{The input dataset (used to validate numeric variables referenced by thresholds).} +} +\value{ +A `data.table` read from `lab_thresholds` (invisibly). +} +\description{ +Validate the `LAB_thresholds` CSV file structure and ensure `Min`/`Max` +columns are numeric where required. +} +\keyword{internal} diff --git a/man/check_lab_unit_conversion.Rd b/man/check_lab_unit_conversion.Rd new file mode 100644 index 0000000..9b45920 --- /dev/null +++ b/man/check_lab_unit_conversion.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_metadata.R +\name{check_lab_unit_conversion} +\alias{check_lab_unit_conversion} +\title{Check LAB_unit_conversion file} +\usage{ +check_lab_unit_conversion( + lab_unit_conversion, + datasource, + list_analyses, + target_unit +) +} +\arguments{ +\item{lab_unit_conversion}{Path to the `LAB_unit_conversion` CSV file.} + +\item{datasource}{Optional datasource identifier (string) used to check +for a `datasource` column when provided.} + +\item{list_analyses}{Character vector of `concept_id` values expected.} + +\item{target_unit}{Named character vector mapping `concept_id` -> `unit_target`.} +} +\value{ +A `data.table` read from `lab_unit_conversion` (invisibly). +} +\description{ +Validate the unit conversion metadata file and basic consistency with +`LAB_target_units`. Returns the parsed `data.table` invisibly. +} +\keyword{internal} diff --git a/man/clean_lab_main.Rd b/man/clean_lab_main.Rd new file mode 100644 index 0000000..318035a --- /dev/null +++ b/man/clean_lab_main.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_lab_main.R +\name{clean_lab_main} +\alias{clean_lab_main} +\title{Main cleaning pipeline implementation} +\usage{ +clean_lab_main( + dataset, + list_analyses = c(), + lab_target_units, + lab_unit_conversion, + lab_thresholds, + datasource = "" +) +} +\arguments{ +\item{dataset}{A `data.frame` or `data.table` with lab measurements.} + +\item{list_analyses}{Character vector of `concept_id` to process (default: all).} + +\item{lab_target_units}{Path to the `LAB_target_units` CSV file.} + +\item{lab_unit_conversion}{Path to the `LAB_unit_conversion` CSV file.} + +\item{lab_thresholds}{Path to the `LAB_thresholds` CSV file.} + +\item{datasource}{Optional datasource identifier (string).} +} +\value{ +A `data.table` containing cleaned rows and appended result columns. +- `included`: 1/0 whether the value is kept +- `value`: cleaned/converted value when `included == 1`, otherwise NA +- `unit_target`: the target unit assigned for the concept +- `conversion`: integer code indicating conversion origin/type (0/1/2/3) +- `rule_applied`: integer code indicating which rule was applied or failure reason +} +\description{ +Internal implementation of the lab cleaning pipeline. Use +`CleanLabValuesDataset()` as the user-facing wrapper. +} +\keyword{internal} diff --git a/man/fill_missing_unit.Rd b/man/fill_missing_unit.Rd new file mode 100644 index 0000000..a12b844 --- /dev/null +++ b/man/fill_missing_unit.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fill_missing_unit.R +\name{fill_missing_unit} +\alias{fill_missing_unit} +\title{Fill missing unit values based on metadata rules} +\usage{ +fill_missing_unit( + dt, + meta_unit_conv, + target_unit, + concept_id_col = "concept_id", + unit_col = "unit" +) +} +\arguments{ +\item{dt}{A `data.table` with the measurements. Operates by reference and +returns the modified `data.table`.} + +\item{meta_unit_conv}{A `data.table` providing conversion/assumption rules.} + +\item{target_unit}{A named character vector mapping `concept_id` to `unit_target`.} + +\item{concept_id_col}{Name of the concept id column in `dt` (default: "concept_id").} + +\item{unit_col}{Name of the unit column in `dt` (default: "unit").} +} +\value{ +The input `data.table` with a new `unit_filled` column. +} +\description{ +Use `meta_unit_conv` and `target_unit` to populate a `unit_filled` +column in `dt`. If an assumed unit is specified in the metadata it is +used; otherwise the `target_unit` mapping is applied. +} +\keyword{internal} diff --git a/man/is_valid_dt_condition.Rd b/man/is_valid_dt_condition.Rd new file mode 100644 index 0000000..5c3c0b6 --- /dev/null +++ b/man/is_valid_dt_condition.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_metadata.R +\name{is_valid_dt_condition} +\alias{is_valid_dt_condition} +\title{Validate a condition string against a data.table} +\usage{ +is_valid_dt_condition(dt, cond) +} +\arguments{ +\item{dt}{A `data.frame` or `data.table` containing the variables used in `cond`.} + +\item{cond}{A single string containing an R expression that can be evaluated +in the context of `dt` (e.g. "value > 0").} +} +\value{ +`TRUE` if the condition is valid, `FALSE` otherwise. +} +\description{ +Check that `cond` is a single, parseable R expression that only +references columns present in `dt` and evaluates to a logical vector of +length 1 or `nrow(dt)`. +} +\keyword{internal} diff --git a/man/mo_convert.Rd b/man/mo_convert.Rd new file mode 100644 index 0000000..28dfd67 --- /dev/null +++ b/man/mo_convert.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mo_convert.R +\name{mo_convert} +\alias{mo_convert} +\title{Convert values using conversion metadata} +\usage{ +mo_convert(dat_unit_matched, metadata_convert) +} +\arguments{ +\item{dat_unit_matched}{A `data.table` containing rows to convert. Must +include `concept_id`, `value`, `unit_origin`, and `unit_target`.} + +\item{metadata_convert}{A `data.table` specifying conversion attempts and thresholds.} +} +\value{ +A `data.table` with conversion result columns added. +} +\description{ +Apply conversion rules described in `metadata_convert` to the rows of +`dat_unit_matched`. This returns a copy of the input with `included`, +`value_converted`, `conversion`, and `rule_applied` columns populated. +} +\keyword{internal} diff --git a/tests/test_clean_lab_main.R b/tests/test_clean_lab_main.R deleted file mode 100644 index 33cad3f..0000000 --- a/tests/test_clean_lab_main.R +++ /dev/null @@ -1,49 +0,0 @@ -# Test script for clean_lab_main -# This script will run clean_lab_main on all Example datasets and compare output to ground truth - -library(data.table) -# Load modular code -source("R/load_dependencies.R") -load_cleanlab() - -examples <- c("Example 1", "Example 2", "Example 3", "Example 4") -base_path <- "tests/data" - -for (ex in examples) { - cat("\nRunning test for", ex, "...\n") - input_dir <- file.path(base_path, ex, "i_input") - gt_dir <- file.path(base_path, ex, "i_ground_truth") - - dataset_lab_values <- fread(file.path(input_dir, "dataset_lab_values.csv")) - path_lab_target_units <- file.path(input_dir, "LAB_target_units.csv") - path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion.csv") - path_lab_thresholds <- file.path(input_dir, "LAB_threshold.csv") - - # Run cleaning - cleaned <- CleanLabValuesDataset( - dataset = dataset_lab_values, - lab_target_units = path_lab_target_units, - lab_unit_conversion = path_unit_conversion, - lab_thresholds = path_lab_thresholds - ) - - # Load ground truth - gt_file <- file.path(gt_dir, "dataset_cleaned_lab_values.csv") - if (file.exists(gt_file)) { - gt <- fread(gt_file) - # Order both by person_id and concept_id for fair comparison (no date column) - setorder(cleaned, person_id, concept_id) - setorder(gt, person_id, concept_id) - - res <- all.equal(cleaned, gt, check.attributes = FALSE) - if (isTRUE(res)) { - cat("Test PASSED for", ex, "\n") - } else { - cat("Test FAILED for", ex, "\n") - # print differences for debugging - print(res) - } - } else { - cat("Ground truth file missing for", ex, "\n") - } -} diff --git a/tests/test_clean_lab_temp.R b/tests/test_clean_lab_temp.R deleted file mode 100644 index 295a26d..0000000 --- a/tests/test_clean_lab_temp.R +++ /dev/null @@ -1,49 +0,0 @@ -# Test script for clean_lab_main -# This script will run clean_lab_main on all Example datasets and compare output to ground truth - -library(data.table) -# Load modular code -source("R/load_dependencies.R") -load_cleanlab() - -examples <- c("Example 4") -base_path <- "tests/data" - -for (ex in examples) { - cat("\nRunning test for", ex, "...\n") - input_dir <- file.path(base_path, ex, "i_input") - gt_dir <- file.path(base_path, ex, "i_ground_truth") - - dataset_lab_values <- fread(file.path(input_dir, "dataset_lab_values.csv")) - path_lab_target_units <- file.path(input_dir, "LAB_target_units.csv") - path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion_wrong.csv") - path_lab_thresholds <- file.path(input_dir, "LAB_threshold.csv") - - # Run cleaning - cleaned <- CleanLabValuesDataset( - dataset = dataset_lab_values, - lab_target_units = path_lab_target_units, - lab_unit_conversion = path_unit_conversion, - lab_thresholds = path_lab_thresholds - ) - - # Load ground truth - gt_file <- file.path(gt_dir, "dataset_cleaned_lab_values.csv") - if (file.exists(gt_file)) { - gt <- fread(gt_file) - # Order both by person_id and concept_id for fair comparison (no date column) - setorder(cleaned, person_id, concept_id) - setorder(gt, person_id, concept_id) - - res <- all.equal(cleaned, gt, check.attributes = FALSE) - if (isTRUE(res)) { - cat("Test PASSED for", ex, "\n") - } else { - cat("Test FAILED for", ex, "\n") - # print differences for debugging - print(res) - } - } else { - cat("Ground truth file missing for", ex, "\n") - } -} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..fa39d82 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(CleanLabValues) + +test_check("CleanLabValues") diff --git a/tests/data/Example 1/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/testthat/data/Example 1/i_ground_truth/dataset_cleaned_lab_values.csv similarity index 100% rename from tests/data/Example 1/i_ground_truth/dataset_cleaned_lab_values.csv rename to tests/testthat/data/Example 1/i_ground_truth/dataset_cleaned_lab_values.csv diff --git a/tests/data/Example 1/i_input/LAB_target_units.csv b/tests/testthat/data/Example 1/i_input/LAB_target_units.csv similarity index 100% rename from tests/data/Example 1/i_input/LAB_target_units.csv rename to tests/testthat/data/Example 1/i_input/LAB_target_units.csv diff --git a/tests/data/Example 1/i_input/LAB_threshold.csv b/tests/testthat/data/Example 1/i_input/LAB_threshold.csv similarity index 100% rename from tests/data/Example 1/i_input/LAB_threshold.csv rename to tests/testthat/data/Example 1/i_input/LAB_threshold.csv diff --git a/tests/data/Example 1/i_input/LAB_unit_conversion.csv b/tests/testthat/data/Example 1/i_input/LAB_unit_conversion.csv similarity index 100% rename from tests/data/Example 1/i_input/LAB_unit_conversion.csv rename to tests/testthat/data/Example 1/i_input/LAB_unit_conversion.csv diff --git a/tests/data/Example 1/i_input/dataset_lab_values.csv b/tests/testthat/data/Example 1/i_input/dataset_lab_values.csv similarity index 100% rename from tests/data/Example 1/i_input/dataset_lab_values.csv rename to tests/testthat/data/Example 1/i_input/dataset_lab_values.csv diff --git a/tests/data/Example 2/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/testthat/data/Example 2/i_ground_truth/dataset_cleaned_lab_values.csv similarity index 100% rename from tests/data/Example 2/i_ground_truth/dataset_cleaned_lab_values.csv rename to tests/testthat/data/Example 2/i_ground_truth/dataset_cleaned_lab_values.csv diff --git a/tests/data/Example 2/i_input/LAB_target_units.csv b/tests/testthat/data/Example 2/i_input/LAB_target_units.csv similarity index 100% rename from tests/data/Example 2/i_input/LAB_target_units.csv rename to tests/testthat/data/Example 2/i_input/LAB_target_units.csv diff --git a/tests/data/Example 2/i_input/LAB_threshold.csv b/tests/testthat/data/Example 2/i_input/LAB_threshold.csv similarity index 100% rename from tests/data/Example 2/i_input/LAB_threshold.csv rename to tests/testthat/data/Example 2/i_input/LAB_threshold.csv diff --git a/tests/data/Example 2/i_input/LAB_unit_conversion.csv b/tests/testthat/data/Example 2/i_input/LAB_unit_conversion.csv similarity index 100% rename from tests/data/Example 2/i_input/LAB_unit_conversion.csv rename to tests/testthat/data/Example 2/i_input/LAB_unit_conversion.csv diff --git a/tests/data/Example 2/i_input/dataset_lab_values.csv b/tests/testthat/data/Example 2/i_input/dataset_lab_values.csv similarity index 100% rename from tests/data/Example 2/i_input/dataset_lab_values.csv rename to tests/testthat/data/Example 2/i_input/dataset_lab_values.csv diff --git a/tests/data/Example 3/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/testthat/data/Example 3/i_ground_truth/dataset_cleaned_lab_values.csv similarity index 100% rename from tests/data/Example 3/i_ground_truth/dataset_cleaned_lab_values.csv rename to tests/testthat/data/Example 3/i_ground_truth/dataset_cleaned_lab_values.csv diff --git a/tests/data/Example 3/i_input/LAB_target_units.csv b/tests/testthat/data/Example 3/i_input/LAB_target_units.csv similarity index 100% rename from tests/data/Example 3/i_input/LAB_target_units.csv rename to tests/testthat/data/Example 3/i_input/LAB_target_units.csv diff --git a/tests/data/Example 3/i_input/LAB_threshold.csv b/tests/testthat/data/Example 3/i_input/LAB_threshold.csv similarity index 100% rename from tests/data/Example 3/i_input/LAB_threshold.csv rename to tests/testthat/data/Example 3/i_input/LAB_threshold.csv diff --git a/tests/data/Example 3/i_input/LAB_unit_conversion.csv b/tests/testthat/data/Example 3/i_input/LAB_unit_conversion.csv similarity index 100% rename from tests/data/Example 3/i_input/LAB_unit_conversion.csv rename to tests/testthat/data/Example 3/i_input/LAB_unit_conversion.csv diff --git a/tests/data/Example 3/i_input/dataset_lab_values.csv b/tests/testthat/data/Example 3/i_input/dataset_lab_values.csv similarity index 100% rename from tests/data/Example 3/i_input/dataset_lab_values.csv rename to tests/testthat/data/Example 3/i_input/dataset_lab_values.csv diff --git a/tests/data/Example 4/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/testthat/data/Example 4/i_ground_truth/dataset_cleaned_lab_values.csv similarity index 100% rename from tests/data/Example 4/i_ground_truth/dataset_cleaned_lab_values.csv rename to tests/testthat/data/Example 4/i_ground_truth/dataset_cleaned_lab_values.csv diff --git a/tests/data/Example 4/i_input/LAB_target_units.csv b/tests/testthat/data/Example 4/i_input/LAB_target_units.csv similarity index 100% rename from tests/data/Example 4/i_input/LAB_target_units.csv rename to tests/testthat/data/Example 4/i_input/LAB_target_units.csv diff --git a/tests/data/Example 4/i_input/LAB_threshold.csv b/tests/testthat/data/Example 4/i_input/LAB_threshold.csv similarity index 100% rename from tests/data/Example 4/i_input/LAB_threshold.csv rename to tests/testthat/data/Example 4/i_input/LAB_threshold.csv diff --git a/tests/data/Example 4/i_input/LAB_threshold_wrong.csv b/tests/testthat/data/Example 4/i_input/LAB_threshold_wrong.csv similarity index 100% rename from tests/data/Example 4/i_input/LAB_threshold_wrong.csv rename to tests/testthat/data/Example 4/i_input/LAB_threshold_wrong.csv diff --git a/tests/data/Example 4/i_input/LAB_unit_conversion.csv b/tests/testthat/data/Example 4/i_input/LAB_unit_conversion.csv similarity index 100% rename from tests/data/Example 4/i_input/LAB_unit_conversion.csv rename to tests/testthat/data/Example 4/i_input/LAB_unit_conversion.csv diff --git a/tests/data/Example 4/i_input/LAB_unit_conversion_wrong.csv b/tests/testthat/data/Example 4/i_input/LAB_unit_conversion_wrong.csv similarity index 100% rename from tests/data/Example 4/i_input/LAB_unit_conversion_wrong.csv rename to tests/testthat/data/Example 4/i_input/LAB_unit_conversion_wrong.csv diff --git a/tests/data/Example 4/i_input/dataset_lab_values.csv b/tests/testthat/data/Example 4/i_input/dataset_lab_values.csv similarity index 100% rename from tests/data/Example 4/i_input/dataset_lab_values.csv rename to tests/testthat/data/Example 4/i_input/dataset_lab_values.csv diff --git a/tests/data/Example 4/i_input/dataset_lab_values_wrong.csv b/tests/testthat/data/Example 4/i_input/dataset_lab_values_wrong.csv similarity index 100% rename from tests/data/Example 4/i_input/dataset_lab_values_wrong.csv rename to tests/testthat/data/Example 4/i_input/dataset_lab_values_wrong.csv diff --git a/tests/testthat/test-examples.R b/tests/testthat/test-examples.R new file mode 100644 index 0000000..639c447 --- /dev/null +++ b/tests/testthat/test-examples.R @@ -0,0 +1,44 @@ +library(CleanLabValues) +library(data.table) + +base_path <- function(...) test_path("data", ...) + +run_example <- function(ex) { + input_dir <- base_path(ex, "i_input") + gt_dir <- base_path(ex, "i_ground_truth") + + dataset <- fread(file.path(input_dir, "dataset_lab_values.csv")) + path_target_units <- file.path(input_dir, "LAB_target_units.csv") + path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion.csv") + path_thresholds <- file.path(input_dir, "LAB_threshold.csv") + + cleaned <- CleanLabValuesDataset( + dataset = dataset, + lab_target_units = path_target_units, + lab_unit_conversion = path_unit_conversion, + lab_thresholds = path_thresholds + ) + + gt <- fread(file.path(gt_dir, "dataset_cleaned_lab_values.csv")) + + setorder(cleaned, person_id, concept_id) + setorder(gt, person_id, concept_id) + + all.equal(cleaned, gt, check.attributes = FALSE) +} + +test_that("Example 1 matches ground truth", { + expect_true(isTRUE(run_example("Example 1"))) +}) + +test_that("Example 2 matches ground truth", { + expect_true(isTRUE(run_example("Example 2"))) +}) + +test_that("Example 3 matches ground truth", { + expect_true(isTRUE(run_example("Example 3"))) +}) + +test_that("Example 4 matches ground truth", { + expect_true(isTRUE(run_example("Example 4"))) +}) diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R new file mode 100644 index 0000000..c937357 --- /dev/null +++ b/tests/testthat/test-validation.R @@ -0,0 +1,30 @@ +library(CleanLabValues) +library(data.table) + +# Verify that using an incorrect unit-conversion file (which passes schema +# validation but encodes the wrong conversion formula) produces results that +# differ from the ground truth, demonstrating the importance of accurate +# metadata. +test_that("Example 4 with wrong unit conversion does not match ground truth", { + input_dir <- test_path("data", "Example 4", "i_input") + gt_dir <- test_path("data", "Example 4", "i_ground_truth") + + dataset <- fread(file.path(input_dir, "dataset_lab_values.csv")) + path_target_units <- file.path(input_dir, "LAB_target_units.csv") + path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion_wrong.csv") + path_thresholds <- file.path(input_dir, "LAB_threshold.csv") + + cleaned <- CleanLabValuesDataset( + dataset = dataset, + lab_target_units = path_target_units, + lab_unit_conversion = path_unit_conversion, + lab_thresholds = path_thresholds + ) + + gt <- fread(file.path(gt_dir, "dataset_cleaned_lab_values.csv")) + + setorder(cleaned, person_id, concept_id) + setorder(gt, person_id, concept_id) + + expect_false(isTRUE(all.equal(cleaned, gt, check.attributes = FALSE))) +})