VAC4EU · YMao-UMCU · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/.DS_Store b/.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,18 @@
+Package: CleanLabValues
+Title: Clean and Convert Laboratory Values
+Version: 0.2.0
+Authors@R:
+    c(person("Rosa", "Gini", role = "aut"),
+      person("Yinan", "Mao", role = c("aut", "cre"), email = "ymao@example.com"))
+Description: Ingests instructions to clean datasets containing results from
+    laboratory analyses. Handles unit conversions, missing-unit imputation, and
+    exclusion of physiologically implausible values based on configurable
+    metadata (target units, conversion rules, and thresholds).
+License: AGPL-3
+Encoding: UTF-8
+RoxygenNote: 7.3.3
+Imports:
+    data.table
+Suggests:
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,3 @@
+# Generated by roxygen2: do not edit by hand
+
+export(CleanLabValuesDataset)
diff --git a/R/CleanLabValuesDataset.R b/R/CleanLabValuesDataset.R
@@ -1,16 +1,3 @@
-# Authors: Rosa Gini, Yinan Mao
-
-# 16 Apr 2026
-
-# Version 0.2
-# Set up checks for the arguments
-# set up Examples 1, ..., 3
-
-# 15 Apr 2026
-
-# Version 0.1
-# Set up function
-
 #' CleanLabValuesDataset
 #'
 #' The function CleanLabValuesDataset ingests instructions to clean datasets containing results from laboratory analyses. The instructions specify which unit of measurement is desired for each laboratory analysis, what the conversion rules are, what to do if the unit of measurement is missing, and which values should be considered absurd and discarded.
@@ -19,9 +6,12 @@
 #' @param dataset the name of a data.table file in memory that contains a dataset of results of laboratory analyses that needs cleaning
 #' @param list_analyses a string vector containing the names of the laboratory analyses to be cleaned. If the argument is not specified, all the laboratory analyses are cleaned
 #' @param lab_target_units a string containing the path to a csv file containing one record for each type of laboratory analysis in list_analyses and specifying the desired unit of measurement
-#' @param lab_unit_conversions a string containing the path towards a csv files containing the specifications to convert the values in the dataset to the target unit of measurement
+#' @param lab_unit_conversion a string containing the path to a csv file containing the specifications to convert the values in the dataset to the target unit of measurement
 #' @param lab_thresholds a string containing the path to a csv file containing the specifications of which values should be considered absurd and discarded, possibly depending on other variables such as age
-#' @param datasource (non mandatory) a string containing name of the datasource that can be stored in lab_unit_conversions to produce a datasource-specific assumption on what to do if the unit of measurement is missing
+#' @param datasource (optional) a string containing the name of the datasource, which can be stored in `lab_unit_conversion` to produce a datasource-specific assumption on what to do if the unit of measurement is missing
+#' @return A `data.table` with the cleaned lab results and appended result columns.
+#'
+#' @export
 #'
 #'
 #' @details
@@ -35,11 +25,6 @@
 #
 
 CleanLabValuesDataset <- function(dataset, list_analyses = c(), lab_target_units, lab_unit_conversion, lab_thresholds, datasource = "") {
-  # Ensure check functions are available; source minimal checks if needed
-  if (!exists("check_dataset_model")) {
-    if (file.exists("R/check_metadata.R")) source("R/check_metadata.R") else stop("Missing R/check_metadata.R")
-  }
-
   # Basic validation of dataset and metadata files
   if (!is.data.frame(dataset) && !data.table::is.data.table(dataset)) stop("`dataset` must be a data.frame or data.table")
   for (varname in c("concept_id", "value", "unit")) if (!(varname %in% names(dataset))) stop(paste("dataset must contain column", varname))

diff --git a/R/check_metadata.R b/R/check_metadata.R
@@ -1,5 +1,14 @@
-# Functions for metadata and input checks
-
+#' Validate a condition string against a data.table
+#'
+#' Check that `cond` is a single, parseable R expression that only
+#' references columns present in `dt` and evaluates to a logical vector of
+#' length 1 or `nrow(dt)`.
+#'
+#' @param dt A `data.frame` or `data.table` containing the variables used in `cond`.
+#' @param cond A single string containing an R expression that can be evaluated
+#'   in the context of `dt` (e.g. "value > 0").
+#' @return `TRUE` if the condition is valid, `FALSE` otherwise.
+#' @keywords internal
 is_valid_dt_condition <- function(dt, cond) {
   if (!is.character(cond) || length(cond) != 1L || is.na(cond)) {
     return(FALSE)
@@ -26,6 +35,14 @@ is_valid_dt_condition <- function(dt, cond) {
   TRUE
 }
 
+#' Check dataset model
+#'
+#' Ensure `dataset` contains the minimal variables required by the
+#' cleaning pipeline (`concept_id`, `value`, `unit`).
+#'
+#' @param dataset A `data.frame` or `data.table` representing the input dataset.
+#' @return Invisibly returns `NULL` on success, otherwise throws an error.
+#' @keywords internal
 check_dataset_model <- function(dataset) {
   for (varname in c("concept_id", "value", "unit")) {
     if (!(varname %in% names(dataset))) {
@@ -49,6 +66,15 @@ check_dataset_model <- function(dataset) {
 #######################################################
 # lab_target_units
 
+#' Check LAB_target_units file
+#'
+#' Validate that the `LAB_target_units` CSV exists and contains at least
+#' `concept_id` and `unit_target` columns. Returns the parsed data.table
+#' invisibly on success.
+#'
+#' @param lab_target_units Path to the `LAB_target_units` CSV file.
+#' @return A `data.table` read from `lab_target_units` (invisibly).
+#' @keywords internal
 check_lab_target_units <- function(lab_target_units) {
   if (!(file.exists(lab_target_units))) {
     stop(paste("The file", lab_target_units, "cannot be found"))
@@ -65,6 +91,18 @@ check_lab_target_units <- function(lab_target_units) {
 #######################################################
 # lab_unit_conversion
 
+#' Check LAB_unit_conversion file
+#'
+#' Validate the unit conversion metadata file and basic consistency with
+#' `LAB_target_units`. Returns the parsed `data.table` invisibly.
+#'
+#' @param lab_unit_conversion Path to the `LAB_unit_conversion` CSV file.
+#' @param datasource Optional datasource identifier (string) used to check
+#'   for a `datasource` column when provided.
+#' @param list_analyses Character vector of `concept_id` values expected.
+#' @param target_unit Named character vector mapping `concept_id` -> `unit_target`.
+#' @return A `data.table` read from `lab_unit_conversion` (invisibly).
+#' @keywords internal
 check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_analyses, target_unit) {
   if (!(file.exists(lab_unit_conversion))) {
     stop(paste("The file", lab_unit_conversion, "cannot be found"))
@@ -255,6 +293,15 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal
 #####################################
 # lab_thresholds
 
+#' Check LAB_thresholds file
+#'
+#' Validate the `LAB_thresholds` CSV file structure and ensure `Min`/`Max`
+#' columns are numeric where required.
+#'
+#' @param lab_thresholds Path to the `LAB_thresholds` CSV file.
+#' @param dataset The input dataset (used to validate numeric variables referenced by thresholds).
+#' @return A `data.table` read from `lab_thresholds` (invisibly).
+#' @keywords internal
 check_lab_thresholds <- function(lab_thresholds, dataset) {
   if (!(file.exists(lab_thresholds))) {
     stop(paste("The file", lab_thresholds, "cannot be found"))

diff --git a/R/clean_lab_main.R b/R/clean_lab_main.R
@@ -1,11 +1,21 @@
-# Main cleaning function for lab values
+#' Main cleaning pipeline implementation
+#'
+#' Internal implementation of the lab cleaning pipeline. Use
+#' `CleanLabValuesDataset()` as the user-facing wrapper.
+#' @param dataset A `data.frame` or `data.table` with lab measurements.
+#' @param list_analyses Character vector of `concept_id` to process (default: all).
+#' @param lab_target_units Path to the `LAB_target_units` CSV file.
+#' @param lab_unit_conversion Path to the `LAB_unit_conversion` CSV file.
+#' @param lab_thresholds Path to the `LAB_thresholds` CSV file.
+#' @param datasource Optional datasource identifier (string).
+#' @return A `data.table` containing cleaned rows and appended result columns.
+#' - `included`: 1/0 whether the value is kept
+#' - `value`: cleaned/converted value when `included == 1`, otherwise NA
+#' - `unit_target`: the target unit assigned for the concept
+#' - `conversion`: integer code indicating conversion origin/type (0/1/2/3)
+#' - `rule_applied`: integer code indicating which rule was applied or failure reason
+#' @keywords internal
 #
-# Output columns appended by this pipeline (see README for full semantics):
-# - `included`: 1/0 whether the value is kept
-# - `value`: cleaned/converted value when `included == 1`, otherwise NA
-# - `unit_target`: the target unit assigned for the concept
-# - `conversion`: integer code indicating conversion origin/type (0/1/2/3)
-# - `rule_applied`: integer code indicating which rule was applied or failure reason
 clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_unit_conversion, lab_thresholds, datasource = "") {
   # Ensure input is a data.table and capture original input column order
   dataset <- data.table::as.data.table(dataset)

diff --git a/R/fill_missing_unit.R b/R/fill_missing_unit.R
@@ -1,4 +1,18 @@
-# Fill missing units in the dataset based on metadata
+## Fill missing units in the dataset based on metadata
+#' Fill missing unit values based on metadata rules
+#'
+#' Use `meta_unit_conv` and `target_unit` to populate a `unit_filled`
+#' column in `dt`. If an assumed unit is specified in the metadata it is
+#' used; otherwise the `target_unit` mapping is applied.
+#'
+#' @param dt A `data.table` with the measurements. Operates by reference and
+#'   returns the modified `data.table`.
+#' @param meta_unit_conv A `data.table` providing conversion/assumption rules.
+#' @param target_unit A named character vector mapping `concept_id` to `unit_target`.
+#' @param concept_id_col Name of the concept id column in `dt` (default: "concept_id").
+#' @param unit_col Name of the unit column in `dt` (default: "unit").
+#' @return The input `data.table` with a new `unit_filled` column.
+#' @keywords internal
 fill_missing_unit <- function(dt, meta_unit_conv, target_unit, concept_id_col = "concept_id", unit_col = "unit") {
   dt[, unit_filled := get(unit_col)]
   for (cid in unique(dt[[concept_id_col]])) {

diff --git a/R/load_dependencies.R b/R/load_dependencies.R
@@ -1,26 +1,10 @@
-# Load all CleanLabValues module scripts in a stable order
-load_cleanlab <- function() {
-  srcs <- c(
-    "R/check_metadata.R",
-    "R/fill_missing_unit.R",
-    "R/mo_convert.R",
-    "R/clean_lab_main.R",
-    "R/CleanLabValuesDataset.R"
-  )
-  # Try sourcing files relative to a few likely project roots so tests
-  # running from `tests/testthat` still find the R/ scripts.
-  roots <- c(".", "..", "..", file.path("..", ".."))
-  for (s in srcs) {
-    found <- FALSE
-    for (r in roots) {
-      p <- file.path(r, s)
-      if (file.exists(p)) {
-        source(p)
-        found <- TRUE
-        break
-      }
-    }
-    if (!found) stop(paste("Missing required file:", s))
-  }
-  invisible(TRUE)
-}
+##' Load dependencies (deprecated for package)
+##'
+##' This file historically contained `load_cleanlab()` which sourced the
+##' module scripts for interactive/testing workflows. In the installed
+##' package this manual sourcing is unnecessary because functions are
+##' exported via the `NAMESPACE` and loaded automatically. Keep this file
+##' as a documentation placeholder only.
+##'
+##' @keywords internal
+NULL
diff --git a/R/mo_convert.R b/R/mo_convert.R
@@ -1,31 +1,12 @@
-# Convert lab values to target units and flag inclusion, using data.table
-#
-# Output conventions (matching README):
-# - `conversion`: type of conversion from origin value to final value:
-#   - `0`: no conversion;
-#   - `1`: conversion from non-missing unit (conversion applied using origin unit rules);
-#   - `2`: conversion from `OTHER` unit. `OTHER` means the input `unit_origin` is
-#     a non-empty string but not listed among conversion rules. `OTHER` rows are
-#     processed through the same attempt/next_attempt chain as `MISSING` rows
-#     (prefilled assumed unit -> explicit next_attempt chain -> missing rows -> fallbacks),
-#     but successful results from the `OTHER` flow are marked `conversion = 2`.
-#   - `3`: conversion from `MISSING` unit (unit was empty/NA and missing-unit rules applied).
-#
-# - `rule_applied`: evaluation of the conversion:
-#   - `0`: no conversion needed and result accepted;
-#   - `1`: conversion needed and result accepted (single attempt accepted or direct match);
-#   - `2`: more than one conversion needed before acceptance (fallbacks);
-#   - `90`: no conversion possible and result discarded (no attempts applicable);
-#   - `91`: one conversion attempted before discarding the result;
-#   - `92`: more than one conversion attempted before discarding the result;
-#   - `99`: discarded because value is non-numeric.
-#
-# Notes:
-# - Attempt counting (`n_conversion_attempts`) is used internally to compute 90/91/92.
-# - The `OTHER` flow intentionally mirrors `MISSING` in attempt ordering and threshold
-#   decisioning; only the final `conversion` code differs (2 vs 3).
-
-## Internal helpers for mo_convert (not exported)
+#' Unit conversion helpers
+#'
+#' Internal helpers and the `mo_convert()` implementation used to convert
+#' lab values to target units and compute inclusion/conversion codes.
+#'
+#' These functions are internal to the package and are not exported.
+#'
+#' @keywords internal
+NULL
 # Coerce `x` to character, trim surrounding whitespace, and lower-case it.
 .mo_norm <- function(x) tolower(trimws(as.character(x)))
 
 .mo_get_var_value <- function(dat, row_idx, varname) {
@@ -221,7 +202,17 @@
   FALSE
 }
 
-# Refactored mo_convert to incrementally try next_attempt (1, 2, ...) for each concept_id/unit_target, discarding if max next_attempt is reached
+#' Convert values using conversion metadata
+#'
+#' Apply conversion rules described in `metadata_convert` to the rows of
+#' `dat_unit_matched`. This returns a copy of the input with `included`,
+#' `value_converted`, `conversion`, and `rule_applied` columns populated.
+#'
+#' @param dat_unit_matched A `data.table` containing rows to convert. Must
+#'   include `concept_id`, `value`, `unit_origin`, and `unit_target`.
+#' @param metadata_convert A `data.table` specifying conversion attempts and thresholds.
+#' @return A `data.table` with conversion result columns added.
+#' @keywords internal
 mo_convert <- function(dat_unit_matched, metadata_convert) {
   # Clean, deterministic implementation of the conversion logic described in README.
   # For each row: try direct unit matches, then follow next_attempt order for missing units

diff --git a/man/CleanLabValuesDataset.Rd b/man/CleanLabValuesDataset.Rd
diff --git a/man/check_dataset_model.Rd b/man/check_dataset_model.Rd
diff --git a/man/check_lab_target_units.Rd b/man/check_lab_target_units.Rd
diff --git a/man/check_lab_thresholds.Rd b/man/check_lab_thresholds.Rd
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Generated by roxygen2: do not edit by hand

		export(CleanLabValuesDataset)