From 9d23b2759dc4000ae3d0f33a5c7a4c2585d8170b Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 06:52:45 -0400 Subject: [PATCH 01/18] Add v0.4 mock_spec foundation --- NAMESPACE | 6 + R/mock_spec.R | 477 +++++++++ development/adr/v04-hybrid-backend.md | 115 +++ development/simstudy-v04.md | 128 +++ development/v04-simstudy-spike/README.md | 155 --- development/v04-simstudy-spike/prototype.R | 1071 -------------------- tests/testthat/test-mock-spec.R | 143 +++ 7 files changed, 869 insertions(+), 1226 deletions(-) create mode 100644 R/mock_spec.R create mode 100644 development/adr/v04-hybrid-backend.md create mode 100644 development/simstudy-v04.md delete mode 100644 development/v04-simstudy-spike/README.md delete mode 100644 development/v04-simstudy-spike/prototype.R create mode 100644 tests/testthat/test-mock-spec.R diff --git a/NAMESPACE b/NAMESPACE index c3b3e55..33c9e9a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,7 +23,12 @@ export(get_variables_by_role) export(has_garbage) export(identify_derived_vars) export(import_from_recodeflow) +export(is_mock_spec) export(make_garbage) +export(mock_spec) +export(mock_spec_categorical) +export(mock_spec_continuous) +export(mock_spec_date) export(parse_range_notation) export(parse_variable_start) export(read_mock_data_config) @@ -31,6 +36,7 @@ export(read_mock_data_config_details) export(sample_with_proportions) export(validate_mock_data_config) export(validate_mock_data_config_details) +export(validate_mock_spec) export(validate_mockdata_metadata) importFrom(stats,rexp) importFrom(stats,rnorm) diff --git a/R/mock_spec.R b/R/mock_spec.R new file mode 100644 index 0000000..69b8a0a --- /dev/null +++ b/R/mock_spec.R @@ -0,0 +1,477 @@ +# ============================================================================== +# MockData v0.4 Specification Layer +# ============================================================================== +# Normalized internal representation for direct APIs, recodeflow adapters, and +# optional generation backends. +# ============================================================================== + +.mock_spec_version <- "0.4.0" + +.mock_spec_model_hints <- c( + "auto", + "native", + "simstudy", + "native-postprocess", + "simstudy-or-native", + "simstudy-advanced", + "diagnostic-required" +) + +`%||%` <- function(x, y) { + if (is.null(x)) y else x +} + +.normalize_provenance <- function(provenance, source = NULL) { + if (is.null(provenance)) { + provenance <- list(adapter = "direct", source = source %||% "direct") + } else if (!is.list(provenance)) { + provenance <- list(adapter = as.character(provenance), source = source %||% as.character(provenance)) + } + + if (is.null(provenance$adapter) || is.na(provenance$adapter) || provenance$adapter == "") { + provenance$adapter <- "unknown" + } + if (is.null(provenance$source) || is.na(provenance$source) || provenance$source == "") { + provenance$source <- provenance$adapter + } + + provenance +} + +.validate_model_hint <- function(model_hint) { + if (length(model_hint) != 1 || is.na(model_hint) || !model_hint %in% .mock_spec_model_hints) { + stop( + "model_hint must be one of: ", + paste(.mock_spec_model_hints, collapse = ", "), + call. = FALSE + ) + } + + invisible(TRUE) +} + +.new_mock_spec_variable <- function(name, + type, + rtype, + distribution = NULL, + range = NULL, + levels = NULL, + proportions = NULL, + formula = NULL, + missing_codes = character(0), + missing_proportions = numeric(0), + garbage_rules = list(), + source_format = NULL, + depends_on = character(0), + provenance = NULL, + model_hint = "auto", + ...) { + if (!is.character(name) || length(name) != 1 || is.na(name) || trimws(name) == "") { + stop("mock_spec variable name must be a non-empty string.", call. = FALSE) + } + if (!is.character(type) || length(type) != 1 || is.na(type) || trimws(type) == "") { + stop("mock_spec variable type must be a non-empty string.", call. = FALSE) + } + + .validate_model_hint(model_hint) + + structure( + c( + list( + name = name, + type = tolower(type), + rtype = tolower(rtype), + distribution = distribution, + range = range, + levels = levels, + proportions = proportions, + formula = formula, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + source_format = source_format, + depends_on = depends_on, + provenance = .normalize_provenance(provenance), + model_hint = model_hint + ), + list(...) + ), + class = c("mock_spec_variable", "list") + ) +} + +.as_mock_spec_variable_list <- function(...) { + variables <- list(...) + + if (length(variables) == 1 && is.null(variables[[1]])) { + return(list()) + } + + if (length(variables) == 1 && is.list(variables[[1]]) && !inherits(variables[[1]], "mock_spec_variable")) { + variables <- variables[[1]] + } + + if (length(variables) == 0) { + return(list()) + } + + if (!all(vapply(variables, inherits, logical(1), what = "mock_spec_variable"))) { + stop("mock_spec() inputs must be mock_spec_variable objects.", call. = FALSE) + } + + names(variables) <- vapply(variables, `[[`, character(1), "name") + variables +} + +#' Create a MockData specification +#' +#' `mock_spec()` creates the normalized v0.4 specification object used by the +#' new architecture. Direct APIs and recodeflow adapters should both normalize +#' into this shape before validation and generation. +#' +#' @param ... `mock_spec_variable` objects, or a single list of them. `NULL` +#' creates an empty specification. +#' @param spec_version Character version of the specification shape. +#' @param provenance List or character describing where the spec came from. +#' @param model_hint Character backend hint. One of the supported MockData model +#' hints. +#' +#' @return S3 object of class `mock_spec`. +#' @export +mock_spec <- function(..., + spec_version = .mock_spec_version, + provenance = list(adapter = "direct", source = "direct"), + model_hint = "auto") { + .validate_model_hint(model_hint) + + structure( + list( + spec_version = spec_version, + provenance = .normalize_provenance(provenance), + model_hint = model_hint, + variables = .as_mock_spec_variable_list(...) + ), + class = c("mock_spec", "list") + ) +} + +#' Create a continuous variable specification +#' +#' @param name Variable name. +#' @param range Numeric vector of length two giving the inclusive valid range. +#' @param distribution Distribution name. Defaults to `"uniform"`. +#' @param mean,sd Optional distribution parameters. +#' @param rtype R output type. Defaults to `"double"`. +#' @param missing_codes Explicit missing-code values. +#' @param missing_proportions Missing-code probabilities aligned to +#' `missing_codes`. +#' @param garbage_rules List of intentional invalid-value rules. +#' @param provenance Provenance metadata. +#' @param model_hint Backend hint. +#' +#' @return A `mock_spec_variable` object. +#' @export +mock_spec_continuous <- function(name, + range, + distribution = "uniform", + mean = NA_real_, + sd = NA_real_, + rtype = "double", + missing_codes = numeric(0), + missing_proportions = numeric(0), + garbage_rules = list(), + provenance = "direct", + model_hint = "auto") { + .new_mock_spec_variable( + name = name, + type = "continuous", + rtype = rtype, + distribution = distribution, + range = range, + mean = mean, + sd = sd, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = model_hint + ) +} + +#' Create a categorical variable specification +#' +#' @param name Variable name. +#' @param levels Character vector of valid levels or codes. +#' @param proportions Optional probabilities aligned to `levels`. +#' @param rtype R output type. Defaults to `"factor"`. +#' @param missing_codes Explicit missing-code values. +#' @param missing_proportions Missing-code probabilities aligned to +#' `missing_codes`. +#' @param garbage_rules List of intentional invalid-value rules. +#' @param provenance Provenance metadata. +#' @param model_hint Backend hint. +#' +#' @return A `mock_spec_variable` object. +#' @export +mock_spec_categorical <- function(name, + levels, + proportions = NULL, + rtype = "factor", + missing_codes = character(0), + missing_proportions = numeric(0), + garbage_rules = list(), + provenance = "direct", + model_hint = "auto") { + .new_mock_spec_variable( + name = name, + type = "categorical", + rtype = rtype, + distribution = "categorical", + levels = levels, + proportions = proportions, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = model_hint + ) +} + +#' Create a date variable specification +#' +#' @param name Variable name. +#' @param range Date vector of length two giving the inclusive valid date range. +#' @param rtype R output type. Defaults to `"date"`. +#' @param source_format Source-format hint. Defaults to `"analysis"`. +#' @param missing_codes Explicit missing-code values. +#' @param missing_proportions Missing-code probabilities aligned to +#' `missing_codes`. +#' @param garbage_rules List of intentional invalid-value rules. +#' @param provenance Provenance metadata. +#' @param model_hint Backend hint. +#' +#' @return A `mock_spec_variable` object. +#' @export +mock_spec_date <- function(name, + range, + rtype = "date", + source_format = "analysis", + missing_codes = character(0), + missing_proportions = numeric(0), + garbage_rules = list(), + provenance = "direct", + model_hint = "native-postprocess") { + .new_mock_spec_variable( + name = name, + type = "date", + rtype = rtype, + distribution = "uniform", + range = range, + source_format = source_format, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = model_hint + ) +} + +#' Check whether an object is a MockData specification +#' +#' @param x Object to check. +#' +#' @return Logical scalar. +#' @export +is_mock_spec <- function(x) { + inherits(x, "mock_spec") +} + +.new_mock_spec_validation_result <- function(valid = TRUE, + errors = character(0), + warnings = character(0), + info = character(0)) { + structure( + list( + valid = valid, + errors = errors, + warnings = warnings, + info = info + ), + class = c("mock_spec_validation_result", "list") + ) +} + +.validate_probability_vector <- function(values, label, allow_null = FALSE) { + errors <- character(0) + + if (is.null(values)) { + if (allow_null) { + return(errors) + } + return(paste0(label, " must not be NULL.")) + } + + if (!is.numeric(values)) { + errors <- c(errors, paste0(label, " must be numeric.")) + } else { + if (any(is.na(values))) { + errors <- c(errors, paste0(label, " must not contain NA values.")) + } + if (any(values < 0 | values > 1, na.rm = TRUE)) { + errors <- c(errors, paste0(label, " must be between 0 and 1.")) + } + } + + errors +} + +.validate_missing_spec <- function(variable) { + errors <- character(0) + + if (length(variable$missing_codes) == 0 && length(variable$missing_proportions) == 0) { + return(errors) + } + + if (length(variable$missing_codes) != length(variable$missing_proportions)) { + errors <- c(errors, paste0( + "Variable '", variable$name, + "' must have one missing proportion per missing code." + )) + } + + errors <- c(errors, .validate_probability_vector( + variable$missing_proportions, + paste0("Variable '", variable$name, "' missing_proportions"), + allow_null = FALSE + )) + + missing_sum <- sum(variable$missing_proportions, na.rm = TRUE) + if (missing_sum > 1) { + errors <- c(errors, paste0( + "Variable '", variable$name, + "' missing proportions must sum to <= 1." + )) + } + + errors +} + +.validate_range <- function(range, variable_name, expected_class = "numeric") { + errors <- character(0) + + if (is.null(range) || length(range) != 2) { + return(paste0("Variable '", variable_name, "' range must have length 2.")) + } + + if (expected_class == "Date") { + if (!inherits(range, "Date")) { + errors <- c(errors, paste0("Variable '", variable_name, "' range must be Date.")) + } + } else if (!is.numeric(range)) { + errors <- c(errors, paste0("Variable '", variable_name, "' range must be numeric.")) + } + + if (any(is.na(range))) { + errors <- c(errors, paste0("Variable '", variable_name, "' range must not contain NA values.")) + } else if (range[[1]] > range[[2]]) { + errors <- c(errors, paste0("Variable '", variable_name, "' range lower bound must be <= upper bound.")) + } + + errors +} + +.validate_mock_spec_variable <- function(variable) { + errors <- character(0) + + if (!inherits(variable, "mock_spec_variable")) { + return("All mock_spec variables must inherit from mock_spec_variable.") + } + + errors <- c(errors, .validate_missing_spec(variable)) + + if (variable$type == "continuous") { + errors <- c(errors, .validate_range(variable$range, variable$name, "numeric")) + if (identical(variable$distribution, "normal")) { + if (is.null(variable$mean) || length(variable$mean) != 1 || is.na(variable$mean)) { + errors <- c(errors, paste0("Variable '", variable$name, "' normal distribution requires mean.")) + } + if (is.null(variable$sd) || length(variable$sd) != 1 || is.na(variable$sd) || variable$sd <= 0) { + errors <- c(errors, paste0("Variable '", variable$name, "' normal distribution requires sd > 0.")) + } + } + } else if (variable$type == "categorical") { + if (is.null(variable$levels) || length(variable$levels) == 0) { + errors <- c(errors, paste0("Variable '", variable$name, "' must have at least one level.")) + } + if (!is.null(variable$proportions)) { + if (length(variable$levels) != length(variable$proportions)) { + errors <- c(errors, paste0("Variable '", variable$name, "' must have one proportion per level.")) + } + errors <- c(errors, .validate_probability_vector( + variable$proportions, + paste0("Variable '", variable$name, "' proportions"), + allow_null = FALSE + )) + prop_sum <- sum(variable$proportions, na.rm = TRUE) + if (abs(prop_sum - 1) > 0.001) { + errors <- c(errors, paste0("Variable '", variable$name, "' proportions must sum to 1.")) + } + } + } else if (variable$type == "date") { + errors <- c(errors, .validate_range(variable$range, variable$name, "Date")) + } else { + errors <- c(errors, paste0("Variable '", variable$name, "' has unsupported type '", variable$type, "'.")) + } + + errors +} + +#' Validate a MockData specification +#' +#' @param spec A `mock_spec` object. +#' @param n Optional number of rows expected for generation. If supplied, must +#' be a non-negative whole number. +#' @param strict Logical. If `TRUE`, invalid specs throw an error. If `FALSE`, +#' a validation result object is returned. +#' +#' @return A `mock_spec_validation_result` object when valid or `strict = FALSE`. +#' @export +validate_mock_spec <- function(spec, n = NULL, strict = TRUE) { + errors <- character(0) + warnings <- character(0) + info <- character(0) + + if (!is_mock_spec(spec)) { + errors <- c(errors, "spec must be a mock_spec object.") + } else { + if (is.null(spec$spec_version) || length(spec$spec_version) != 1 || is.na(spec$spec_version)) { + errors <- c(errors, "mock_spec must have a scalar spec_version.") + } + if (is.null(spec$variables) || !is.list(spec$variables)) { + errors <- c(errors, "mock_spec variables must be a list.") + } else { + variable_names <- names(spec$variables) + if (length(variable_names) != length(unique(variable_names))) { + errors <- c(errors, "mock_spec variable names must be unique.") + } + for (variable in spec$variables) { + errors <- c(errors, .validate_mock_spec_variable(variable)) + } + } + } + + if (!is.null(n)) { + if (!is.numeric(n) || length(n) != 1 || is.na(n) || n < 0 || n != floor(n)) { + errors <- c(errors, "n must be a non-negative whole number.") + } + } + + valid <- length(errors) == 0 + result <- .new_mock_spec_validation_result(valid, errors, warnings, info) + + if (!valid && isTRUE(strict)) { + stop(paste(errors, collapse = "\n"), call. = FALSE) + } + + result +} diff --git a/development/adr/v04-hybrid-backend.md b/development/adr/v04-hybrid-backend.md new file mode 100644 index 0000000..eca4420 --- /dev/null +++ b/development/adr/v04-hybrid-backend.md @@ -0,0 +1,115 @@ +# ADR: v0.4 Hybrid Backend Architecture + +**Status**: draft +**Date**: 2026-05-18 +**Decision owner**: MockData maintainers + +## Context + +MockData began as an experiment for generating mock testing data from +recodeflow-style metadata. It now supports categorical, continuous, date, +garbage-data, and survival-style examples. People are using it, and it is +becoming part of the recodeflow/cchsflow/chmsflow adoption path. + +The v0.3 architecture grew organically. The current generators filter metadata, +parse ranges, infer generation parameters, generate values, apply missing codes, +inject garbage, coerce types, and return columns. That made early development +fast, but it makes cross-variable structure, validation, diagnostics, and +backend selection harder to reason about. + +The v0.4 spike tested whether MockData can normalize user inputs into a +`mock_spec`, generate through either a native backend or `simstudy`, and keep +MockData-specific semantics as post-processing. Three review rounds converged on +the same conclusion: the hybrid architecture is ready for production refactor +planning. + +## Decision + +MockData v0.4 will move toward a hybrid backend architecture: + +- `mock_spec` is the normalized internal representation. +- Native MockData generation is the default backend and must work without + `simstudy`. +- `simstudy` is an optional advanced backend for features where it clearly helps, + including formula dependencies, correlations, survival durations, and mature + simulation mechanics. +- MockData remains responsible for recodeflow semantics, simple direct APIs, + validation, explicit missing-code conventions, garbage/invalid data, + diagnostics, date/source-format conversion, and calendar anchoring. +- `mock_spec` carries `spec_version`, `provenance`, and `model_hint` to preserve + adapter agnosticism. + +## License And Dependency Posture + +MockData remains MIT. `simstudy` is GPL-3, so it will initially be kept optional +in `Suggests` and accessed through `requireNamespace()`. + +Importing `simstudy` as a required dependency would require a conscious future +governance decision. + +## API And Deprecation + +Current public functions remain available in v0.4.0: + +- `create_mock_data()` +- `create_cat_var()` +- `create_con_var()` +- `create_date_var()` +- `create_wide_survival_data()` + +These functions should become wrappers around the new layered internals where +possible. They should not be removed in v0.4.0. + +Deprecation policy: + +- No removal before v0.5.0. +- Lifecycle deprecation warnings may be added during v0.4.x only after sibling + package maintainers have had a migration path. +- `NEWS.md` must include v0.4.0 migration notes and any deprecation timeline. + +## Non-Goals + +MockData will not market itself as synthetic data for inference, privacy release, +or population-valid statistical analysis. It generates mock data for code +development, QA, documentation, examples, and training. + +## Consequences + +Positive: + +- Recodeflow support remains central. +- Simple users can use direct APIs without learning `simstudy`. +- Advanced users can benefit from a mature simulation backend. +- Missing codes, garbage data, dates, and diagnostics stay MockData-owned. +- Native generation keeps the package usable where optional dependencies are not + installed. + +Tradeoffs: + +- The package needs a real internal spec model. +- Backends must be tested for parity where both support the same feature. +- Some spike contracts need production design: diagnostics, date offsets, + formula syntax, custom distribution registry, and correlation merging. +- Maintaining wrappers will add short-term complexity. + +## Implementation Direction + +Production refactor should proceed in layers: + +1. `mock_spec` constructors and validators. +2. Direct and recodeflow input adapters. +3. Formula/dependency evaluator. +4. Native backend. +5. Post-processing layer. +6. Promotion of spike assertions to `testthat`. +7. Optional `simstudy` backend. +8. Current API wrappers. + +## Open Follow-Up Decisions + +- Multi-group correlation merge strategy. +- Diagnostics object shape. +- Whether `mock_spec` is internal-only or partially user-facing in v0.4.0. +- How formula/dependency syntax enters from recodeflow or direct APIs. +- How Table 1 / summary specifications become a future adapter. + diff --git a/development/simstudy-v04.md b/development/simstudy-v04.md new file mode 100644 index 0000000..5772839 --- /dev/null +++ b/development/simstudy-v04.md @@ -0,0 +1,128 @@ +# MockData v0.4 Production Refactor Plan + +## 1. Write The ADR First + +Write a short architecture decision record before production code changes. + +The ADR should lock these decisions: + +- **Decision**: MockData adopts a hybrid backend architecture. +- **Core abstraction**: `mock_spec` is the normalized internal specification. +- **Forward compatibility**: `mock_spec` carries `spec_version`, `provenance`, + and `model_hint` so direct APIs, recodeflow adapters, and future adapters can + share one internal representation. +- **Default backend**: native MockData generation remains the default and must + work without `simstudy`. +- **Optional backend**: `simstudy` is an advanced backend, initially in + `Suggests`, gated with `requireNamespace()`. +- **License posture**: keep MockData MIT by keeping `simstudy` optional unless a + future governance decision changes that. +- **Version target**: v0.4.0. +- **NEWS commitment**: `NEWS.md` gets a v0.4.0 section with breaking changes, + the new spec model, migration notes, and deprecated functions. +- **Current API timeline**: existing public functions remain as wrappers through + v0.4.0. They may be marked lifecycle-deprecated in v0.4.x after sibling + packages have migrated, and removed no earlier than v0.5.0. +- **Non-goal**: MockData remains mock data for code development, QA, and + documentation. It is not marketed as synthetic data for inference or privacy + release. + +## 2. Implement In Layers + +Each layer should have focused tests before the next layer starts. + +1. **`mock_spec` core** + - Constructors and validators. + - Stable fields for names, types, ranges, levels, proportions, missing codes, + garbage rules, formulas, dates, and backend hints. + - Explicit handling for empty specs, `NULL` metadata, single-row specs, and + `n = 0`. + +2. **Input adapters** + - Direct function-argument APIs to `mock_spec`. + - Recodeflow `variables` + `variable_details` adapter to `mock_spec`. + - Preserve recodeflow semantics as first-class behavior. + +3. **Formula/dependency evaluator** + - Promote the spike pattern to core: formula referent validation, topological + ordering, cycle detection, and sandboxed evaluation in a generated-data + environment. + +4. **Native backend** + - Generate valid baseline values from `mock_spec`. + - Keep native support for the simple/core path without `simstudy`. + - Add multi-group correlation strategy, including merge behavior with + ordinary variables. + +5. **Post-processing layer** + - Missing codes. + - Garbage values. + - `rType` coercion. + - Date/source-format conversion. + - Diagnostics contract. + - Replace the legacy `var_row` garbage shim with typed garbage specs. + +6. **Spike assertion promotion** + - Promote the strongest spike assertions into `testthat`, especially: + categorical code/label preservation, missing-code collision diagnostics, + seed reproducibility, censor/event date invariants, recEnd-driven + missingness, formula dependency validation, and correlation contracts. + +7. **Optional `simstudy` backend** + - Translate supported `mock_spec` pieces to `simstudy` definitions. + - Keep `simstudy` optional with clear errors when unavailable. + - Test native/simstudy parity for column names, types, reproducibility, and + expected statistical contracts. + +8. **Orchestrator wrappers** + - Gradually replace current dispatch internals. + - Keep `create_mock_data()`, `create_cat_var()`, `create_con_var()`, and + `create_date_var()` alive as wrappers during transition. + +## 3. Keep The Current API Alive + +Existing public functions should remain available in v0.4.0: + +- `create_mock_data()` +- `create_cat_var()` +- `create_con_var()` +- `create_date_var()` +- `create_wide_survival_data()` + +These should call the new layered internals where possible. Migration should be +incremental so cchsflow, chmsflow, and recodeflow users do not need a +synchronized release. + +## 4. Carry-Forward Design Issues + +Settle in the ADR or the first design note: + +- Multi-group correlation merge strategy. +- Whether `mock_spec` is internal-only in v0.4.0 or partially user-facing. +- Diagnostics object shape and stability. +- `simstudy` dependency posture after governance review. +- Deprecation schedule for current public wrappers. + +Track as implementation issues: + +- Empty / `NULL` / single-row input behavior. +- Date `__offset` convention and whether it remains internal. +- Garbage `var_row` shim replacement. +- Distribution registry for custom backend functions. +- Seed discipline between baseline generation and post-processing. +- Native vs `simstudy` backend equivalence tests. +- Event/censoring rate tests with two-sided bounds. +- Table 1 / summary-spec source as a future adapter. + +## 5. Communication + +Before v0.4.0 lands, write a short communication note for cchsflow, chmsflow, +and recodeflow maintainers: + +- What changes. +- What does not change. +- Which functions remain available. +- What migration is optional in v0.4.0. +- When deprecation warnings may begin. +- How the mock-data framing remains distinct from synthetic-data release. + diff --git a/development/v04-simstudy-spike/README.md b/development/v04-simstudy-spike/README.md deleted file mode 100644 index 42ca5c1..0000000 --- a/development/v04-simstudy-spike/README.md +++ /dev/null @@ -1,155 +0,0 @@ -# MockData v0.4 simstudy Spike - -This directory contains a disposable architecture spike for MockData v0.4. -It is intentionally outside the package build and is excluded by -`.Rbuildignore`. - -The working thesis: - -> MockData remains the recodeflow-native, MIT-licensed interface for practical -> mock data. `simstudy` is evaluated as an optional advanced engine for cases -> where it clearly improves robustness, performance, or modeling capability. - -## Run - -Install `simstudy` into a temporary library, then run: - -```r -lib <- "/private/tmp/mockdata-simstudy-lib" -dir.create(lib, recursive = TRUE, showWarnings = FALSE) -install.packages("simstudy", lib = lib, repos = "https://cloud.r-project.org") - -source("development/v04-simstudy-spike/prototype.R") -``` - -The prototype expects that temporary library path by default. It does not -modify `DESCRIPTION`, `renv.lock`, or the package library. - -To exercise the native fallback path without `simstudy`, point the temporary -library variable at an empty path: - -```sh -MOCKDATA_SIMSTUDY_LIB=/private/tmp/no-simstudy-lib \ - Rscript --vanilla development/v04-simstudy-spike/prototype.R -``` - -## What This Prototype Tests - -- Recodeflow-style metadata can normalize into a small internal `mock_spec`. -- The same `mock_spec` can translate to `simstudy::defData()` definitions for - age, smoking, interview date offsets, and one formula dependency. -- `simstudy` can preserve recodeflow categorical codes via categorical levels, - and both backends can also generate non-numeric categorical labels such as - `"never"`, `"former"`, and `"current"`. -- Truncated normal age generation can be handled with a `simstudy` custom - distribution while MockData still owns range parsing and rType coercion. -- MockData-style explicit missing codes and garbage values can remain - post-processing after baseline valid-value generation. -- Correlated height/weight generation is straightforward through - `simstudy::genCorData()` when correlation parameters are declared in - `mock_spec` and translated through the backend definition layer. -- Survival durations can be generated through `simstudy::defSurv()` / - `simstudy::genSurv()` and then anchored back to MockData-owned calendar - dates. -- The same `mock_spec` can drive a native MockData-style generation path when - `simstudy` is absent. -- Missing-code collisions are detectable if post-processing preserves assignment - diagnostics. This matters when a valid drawn value can equal an explicit - missing code. -- Seed reproducibility can be asserted across native and `simstudy` paths. -- Formula dependencies can be validated for missing referents and sorted so - formula variables are generated after their inputs. -- Truncated-normal boundary collapse now fails loudly instead of returning - `NaN`. - -## Early Read - -This first pass supports the hybrid design: - -- `simstudy` looks strong as a generation engine. -- MockData should still own recodeflow semantics, direct simple APIs, validation, - missing-code conventions, garbage data, source formats, and calendar anchoring. -- The normalized `mock_spec` abstraction is useful enough to keep testing. - -Open questions remain around dependency/license posture, error wrapping, -structural constraints, and whether ordinary simple variables should use native -MockData generation or route through `simstudy`. - -## License and Dependency Posture - -`simstudy` is GPL-3. MockData is currently MIT. This spike treats `simstudy` as -an optional advanced backend rather than a required package dependency. A future -architecture decision needs to explicitly decide whether: - -- MockData keeps `simstudy` in `Suggests` with a soft `requireNamespace()` gate. -- MockData imports `simstudy` and accepts the license/governance implications. -- MockData keeps a native engine and only borrows design ideas from `simstudy`. - -The current prototype supports the first option technically: the native fallback -path runs without loading `simstudy`. - -## Prototype Contracts Surfaced - -- `model_hint` is currently a small enum in the prototype rather than an - unrestricted string. -- `provenance` is stored as structured metadata and displayed compactly in the - printed spec table. -- `mockdata_diagnostics` is the prototype mechanism for preserving assignment - state after post-processing. This is what lets MockData distinguish a value - that was drawn as valid from the same value assigned as an explicit missing - code. -- Survival date columns use mutually exclusive `event_date` and `censor_date` - values. Event rows do not also receive a censoring date. - -## Review Gaps Addressed After First PR Review - -- Added `spec_version`, `provenance`, and `model_hint` fields. -- Added a fail-loud `from_linkml()` placeholder to keep a future third adapter - visible without pretending it is implemented. -- Added native backend generation from the same `mock_spec`. -- Added a missing-code collision case where `97` can be both a valid generated - value and an assigned missing code. -- Moved the height/weight correlation example into a `mock_spec` declaration. -- Added seed reproducibility assertions. - -## Review Gaps Addressed After Second PR Review - -- Fixed the survival censoring semantic bug where event rows also had - `censor_date` populated. -- Routed correlation parameters through `mock_spec` and the backend definition - layer, with both `simstudy` and native Cholesky-based generation paths. -- Added formula referent validation and dependency ordering. -- Added statistical-contract assertions for truncated normal moments, - categorical proportions, garbage rates, and correlation marginals. -- Added non-numeric categorical label coverage. - -## Remaining Questions Before Production Refactor - -- Whether `mockdata_diagnostics` should become a formal internal contract or a - different diagnostics object. -- How much of the prototype `mock_spec` should become user-facing for advanced - users. -- Whether formula/dependency syntax should come from recodeflow metadata, - direct MockData arguments, or a future third adapter. -- Whether `simstudy` remains a `Suggests` backend long term or becomes a - stronger package dependency after governance review. - -## Internal Contracts Not Yet Generalized - -The prototype deliberately encodes a few implementation contracts that are -useful evidence, but not production-ready API decisions: - -- Date variables use hidden `__offset` companion columns during generation, then - convert those offsets to calendar dates in post-processing. -- Custom `simstudy` distributions are resolved by global function name, as with - `mockdata_rtrunc_norm`; production code likely needs an explicit distribution - registry. -- Formula variables are added directly to `mock_spec` in the spike rather than - parsed from recodeflow metadata. -- Garbage post-processing still rebuilds a row-shaped `var_row` object to reuse - the v0.3 `apply_garbage()` helper. -- The prototype uses `seed + 1` for post-processing so baseline generation and - missing/garbage assignment are reproducible but distinct. -- Correlated variables currently use a separate backend path from ordinary - `defData()` generation; production code needs a merge strategy for multiple - correlation groups and ordinary variables. diff --git a/development/v04-simstudy-spike/prototype.R b/development/v04-simstudy-spike/prototype.R deleted file mode 100644 index da5bbd3..0000000 --- a/development/v04-simstudy-spike/prototype.R +++ /dev/null @@ -1,1071 +0,0 @@ -# MockData v0.4 simstudy architecture spike. -# -# This is deliberately a prototype, not package code. It tests whether a small -# normalized mock_spec can sit between recodeflow metadata and simstudy. - -local({ - spike_lib <- Sys.getenv("MOCKDATA_SIMSTUDY_LIB", "/private/tmp/mockdata-simstudy-lib") - if (dir.exists(spike_lib)) { - .libPaths(c(spike_lib, .libPaths())) - } -}) - -simstudy_available <- requireNamespace("simstudy", quietly = TRUE) - -source("R/mockdata-parsers.R", local = TRUE) -source("R/mockdata_helpers.R", local = TRUE) - -MODEL_HINTS <- c( - "hybrid", - "auto", - "native-postprocess", - "simstudy-or-native", - "simstudy-advanced", - "diagnostic-required" -) - -`%||%` <- function(x, y) { - if (is.null(x)) y else x -} - -mockdata_rtrunc_norm <- function(n, min, max, mu, s) { - if (any(!is.finite(min)) || any(!is.finite(max)) || any(min >= max)) { - stop("Truncated normal requires finite min < max.", call. = FALSE) - } - - f_min <- stats::pnorm(min, mean = mu, sd = s) - f_max <- stats::pnorm(max, mean = mu, sd = s) - if (any(!is.finite(f_min)) || any(!is.finite(f_max)) || any(f_min >= f_max)) { - stop("Truncated normal bounds collapse to an empty probability interval.", call. = FALSE) - } - - stats::qnorm(stats::runif(n, min = f_min, max = f_max), mean = mu, sd = s) -} - -new_mock_spec <- function(vars, - spec_version = "0.4-spike-1", - provenance = list(adapter = "mixed", source = "prototype"), - model_hint = "hybrid", - correlation_groups = list()) { - validate_model_hint(model_hint) - - structure( - vars, - class = "mock_spec", - spec_version = spec_version, - provenance = provenance, - model_hint = model_hint, - correlation_groups = correlation_groups - ) -} - -add_spec_metadata <- function(var, - spec_version = "0.4-spike-1", - provenance = "direct", - model_hint = "auto") { - validate_model_hint(model_hint) - - var$spec_version <- spec_version - var$provenance <- normalize_provenance(provenance) - var$model_hint <- model_hint - var -} - -normalize_provenance <- function(provenance) { - if (is.list(provenance)) { - return(provenance) - } - - list(adapter = provenance, source = provenance) -} - -format_provenance <- function(provenance) { - provenance <- normalize_provenance(provenance) - values <- unique(unname(unlist(provenance, use.names = FALSE))) - paste(values, collapse = "/") -} - -validate_model_hint <- function(model_hint) { - if (!model_hint %in% MODEL_HINTS) { - stop( - "Unknown model_hint: ", model_hint, - ". Expected one of: ", paste(MODEL_HINTS, collapse = ", "), - call. = FALSE - ) - } - - invisible(TRUE) -} - -mock_spec_continuous <- function(name, - range, - distribution = "uniform", - mean = NA_real_, - sd = NA_real_, - rtype = "double", - missing_codes = numeric(0), - missing_proportions = numeric(0), - garbage = list(), - source = "direct", - provenance = source, - model_hint = "auto", - correlation_group = NA_character_) { - add_spec_metadata(list( - name = name, - type = "continuous", - rtype = rtype, - distribution = distribution, - range = range, - mean = mean, - sd = sd, - levels = NULL, - proportions = NULL, - formula = NULL, - missing_codes = missing_codes, - missing_proportions = missing_proportions, - garbage = garbage, - source = source, - correlation_group = correlation_group - ), provenance = provenance, model_hint = model_hint) -} - -mock_spec_categorical <- function(name, - levels, - proportions = NULL, - rtype = "factor", - missing_codes = character(0), - missing_proportions = numeric(0), - garbage = list(), - source = "direct", - provenance = source, - model_hint = "auto") { - if (is.null(proportions)) { - proportions <- rep(1 / length(levels), length(levels)) - } - - add_spec_metadata(list( - name = name, - type = "categorical", - rtype = rtype, - distribution = "categorical", - range = NULL, - mean = NA_real_, - sd = NA_real_, - levels = levels, - proportions = proportions, - formula = NULL, - missing_codes = missing_codes, - missing_proportions = missing_proportions, - garbage = garbage, - source = source - ), provenance = provenance, model_hint = model_hint) -} - -mock_spec_date <- function(name, - range, - rtype = "date", - source_format = "analysis", - source = "direct", - provenance = source, - model_hint = "native-postprocess") { - add_spec_metadata(list( - name = name, - type = "date", - rtype = rtype, - distribution = "uniform", - range = range, - mean = NA_real_, - sd = NA_real_, - levels = NULL, - proportions = NULL, - formula = NULL, - missing_codes = character(0), - missing_proportions = numeric(0), - garbage = list(), - source_format = source_format, - source = source - ), provenance = provenance, model_hint = model_hint) -} - -mock_spec_binary_formula <- function(name, formula, rtype = "integer") { - add_spec_metadata(list( - name = name, - type = "binary_formula", - rtype = rtype, - distribution = "binary", - range = c(0, 1), - mean = NA_real_, - sd = NA_real_, - levels = NULL, - proportions = NULL, - formula = formula, - missing_codes = character(0), - missing_proportions = numeric(0), - garbage = list(), - source = "formula" - ), provenance = "formula", model_hint = "simstudy-or-native") -} - -mock_spec_correlated_continuous <- function(name, - mean, - sd, - correlation_group, - range = c(-Inf, Inf), - rtype = "double") { - mock_spec_continuous( - name = name, - range = range, - distribution = "correlated_normal", - mean = mean, - sd = sd, - rtype = rtype, - source = "correlation_spec", - provenance = "direct", - model_hint = "simstudy-advanced", - correlation_group = correlation_group - ) -} - -spec_table <- function(spec) { - data.frame( - name = vapply(spec, `[[`, character(1), "name"), - type = vapply(spec, `[[`, character(1), "type"), - rtype = vapply(spec, `[[`, character(1), "rtype"), - distribution = vapply(spec, `[[`, character(1), "distribution"), - source = vapply(spec, `[[`, character(1), "source"), - provenance = vapply(spec, function(x) format_provenance(x$provenance), character(1)), - model_hint = vapply(spec, `[[`, character(1), "model_hint"), - stringsAsFactors = FALSE - ) -} - -first_range <- function(details_subset) { - for (value in details_subset$recStart) { - parsed <- parse_range_notation(value) - if (!is.null(parsed) && parsed$type %in% c("integer", "continuous", "date")) { - return(c(parsed$min, parsed$max)) - } - } - - NULL -} - -extract_missing <- function(details_subset) { - missing_rows <- details_subset[ - grepl("^NA::", details_subset$recEnd %||% "", ignore.case = TRUE), - ] - - if (nrow(missing_rows) == 0) { - return(list(codes = character(0), proportions = numeric(0))) - } - - props <- missing_rows$proportion - props[is.na(props)] <- 0 - list( - codes = stats::setNames(missing_rows$recStart, missing_rows$recStart), - proportions = stats::setNames(props, missing_rows$recStart) - ) -} - -extract_garbage <- function(var_row) { - fields <- c( - "garbage_low_prop", "garbage_low_range", - "garbage_high_prop", "garbage_high_range" - ) - fields <- fields[fields %in% names(var_row)] - stats::setNames(as.list(var_row[1, fields, drop = TRUE]), fields) -} - -as_mock_spec_from_recodeflow <- function(variables, variable_details, databaseStart) { - out <- list() - - for (i in seq_len(nrow(variables))) { - var_row <- variables[i, ] - name <- var_row$variable - details_subset <- variable_details[ - variable_details$variable == name & - .database_start_matches(variable_details$databaseStart, databaseStart, allow_empty = TRUE), - ] - - type <- tolower(var_row$variableType) - rtype <- tolower(var_row$rType) - missing <- extract_missing(details_subset) - garbage <- extract_garbage(var_row) - - if (type %in% c("continuous", "integer", "numeric")) { - out[[name]] <- mock_spec_continuous( - name = name, - range = first_range(details_subset), - distribution = if ("distribution" %in% names(var_row)) var_row$distribution else "uniform", - mean = if ("mean" %in% names(var_row)) var_row$mean else NA_real_, - sd = if ("sd" %in% names(var_row)) var_row$sd else NA_real_, - rtype = rtype, - missing_codes = missing$codes, - missing_proportions = missing$proportions, - garbage = garbage, - source = "recodeflow" - ) - } else if (type %in% c("categorical", "factor")) { - props <- extract_proportions(details_subset, name) - missing_codes <- stats::setNames(names(props$missing), names(props$missing)) - out[[name]] <- mock_spec_categorical( - name = name, - levels = props$categories, - proportions = props$category_proportions, - rtype = rtype, - missing_codes = missing_codes, - missing_proportions = unlist(props$missing, use.names = TRUE), - garbage = garbage, - source = "recodeflow" - ) - } else if (type == "date") { - out[[name]] <- mock_spec_date( - name = name, - range = first_range(details_subset), - rtype = rtype, - source = "recodeflow" - ) - } else { - stop("Unsupported prototype variableType: ", var_row$variableType, call. = FALSE) - } - } - - new_mock_spec(out) -} - -simstudy_formula <- function(x) { - paste(x, collapse = ";") -} - -formula_dependencies <- function(var) { - if (is.null(var$formula) || is.na(var$formula)) { - return(character(0)) - } - - all.vars(str2lang(var$formula)) -} - -order_spec_by_dependencies <- function(spec) { - remaining <- names(spec) - ordered <- character(0) - - while (length(remaining) > 0) { - progressed <- FALSE - - for (name in remaining) { - deps <- intersect(formula_dependencies(spec[[name]]), names(spec)) - if (all(deps %in% ordered)) { - ordered <- c(ordered, name) - remaining <- setdiff(remaining, name) - progressed <- TRUE - } - } - - if (!progressed) { - stop( - "Formula dependency cycle or unresolved ordering among: ", - paste(remaining, collapse = ", "), - call. = FALSE - ) - } - } - - new_mock_spec( - spec[ordered], - spec_version = attr(spec, "spec_version"), - provenance = attr(spec, "provenance"), - model_hint = attr(spec, "model_hint"), - correlation_groups = attr(spec, "correlation_groups") %||% list() - ) -} - -validate_formula_referents <- function(spec) { - spec_names <- names(spec) - - for (var in spec) { - missing <- setdiff(formula_dependencies(var), spec_names) - if (length(missing) > 0) { - stop( - "Formula for variable '", var$name, "' references unknown variable(s): ", - paste(missing, collapse = ", "), - call. = FALSE - ) - } - } - - invisible(TRUE) -} - -correlation_defs_from_spec <- function(spec) { - groups <- attr(spec, "correlation_groups") %||% list() - group_names <- unique(na.omit(vapply( - spec, - function(var) var$correlation_group %||% NA_character_, - character(1) - ))) - - lapply(stats::setNames(group_names, group_names), function(group_name) { - vars <- spec[vapply( - spec, - function(var) identical(var$correlation_group %||% NA_character_, group_name), - logical(1) - )] - config <- groups[[group_name]] %||% list(rho = 0, corstr = "cs") - - list( - group = group_name, - names = vapply(vars, `[[`, character(1), "name"), - means = vapply(vars, `[[`, numeric(1), "mean"), - sds = vapply(vars, `[[`, numeric(1), "sd"), - rho = config$rho %||% 0, - corstr = config$corstr %||% "cs" - ) - }) -} - -as_simstudy_def <- function(spec) { - validate_formula_referents(spec) - spec <- order_spec_by_dependencies(spec) - def <- NULL - - for (var in spec) { - if (identical(var$distribution, "correlated_normal")) { - next - } - - if (!simstudy_available) { - stop("simstudy is not available; use backend = 'native' for this spike.", call. = FALSE) - } - - if (var$type == "continuous") { - range <- var$range - if (var$distribution == "normal") { - def <- simstudy::defData( - def, - varname = var$name, - formula = "mockdata_rtrunc_norm", - variance = paste0( - "min = ", range[[1]], - ", max = ", range[[2]], - ", mu = ", var$mean, - ", s = ", var$sd - ), - dist = "custom" - ) - } else if (var$rtype == "integer") { - def <- simstudy::defData( - def, - varname = var$name, - formula = simstudy_formula(range), - dist = "uniformInt" - ) - } else { - def <- simstudy::defData( - def, - varname = var$name, - formula = simstudy_formula(range), - dist = "uniform" - ) - } - } else if (var$type == "categorical") { - def <- simstudy::defData( - def, - varname = var$name, - formula = simstudy_formula(var$proportions), - variance = simstudy_formula(var$levels), - dist = "categorical" - ) - } else if (var$type == "date") { - days <- as.integer(var$range[[2]] - var$range[[1]]) - def <- simstudy::defData( - def, - varname = paste0(var$name, "__offset"), - formula = paste0("0;", days), - dist = "uniformInt" - ) - } else if (var$type == "binary_formula") { - def <- simstudy::defData( - def, - varname = var$name, - formula = var$formula, - dist = "binary", - link = "logit" - ) - } - } - - structure( - list(data_def = def, correlation_groups = correlation_defs_from_spec(spec)), - class = "mock_simstudy_def" - ) -} - -generate_mock_data_simstudy <- function(spec, n, seed = NULL) { - if (!simstudy_available) { - stop("simstudy is not available; use generate_mock_data_native().", call. = FALSE) - } - - if (!is.null(seed)) { - set.seed(seed) - } - - sim_def <- as_simstudy_def(spec) - simstudy::genData(n, sim_def$data_def) -} - -generate_mock_data_native <- function(spec, n, seed = NULL) { - validate_formula_referents(spec) - spec <- order_spec_by_dependencies(spec) - - if (!is.null(seed)) { - set.seed(seed) - } - - data <- data.frame(id = seq_len(n)) - - for (var in spec) { - if (var$type == "continuous") { - if (identical(var$distribution, "normal")) { - data[[var$name]] <- mockdata_rtrunc_norm( - n, - min = var$range[[1]], - max = var$range[[2]], - mu = var$mean, - s = var$sd - ) - } else if (var$rtype == "integer") { - data[[var$name]] <- sample(seq(var$range[[1]], var$range[[2]]), n, replace = TRUE) - } else { - data[[var$name]] <- stats::runif(n, var$range[[1]], var$range[[2]]) - } - } else if (var$type == "categorical") { - data[[var$name]] <- sample( - var$levels, - n, - replace = TRUE, - prob = var$proportions - ) - } else if (var$type == "date") { - days <- as.integer(var$range[[2]] - var$range[[1]]) - data[[paste0(var$name, "__offset")]] <- sample(0:days, n, replace = TRUE) - } else if (var$type == "binary_formula") { - linear_predictor <- eval(str2expression(var$formula), envir = data, enclos = parent.frame()) - data[[var$name]] <- stats::rbinom(n, size = 1, prob = stats::plogis(linear_predictor)) - } - } - - data -} - -generate_correlated_simstudy <- function(sim_def, n, seed = NULL) { - if (!simstudy_available) { - stop("simstudy is not available; use generate_correlated_native().", call. = FALSE) - } - - if (!is.null(seed)) { - set.seed(seed) - } - - group <- sim_def$correlation_groups[[1]] - as.data.frame(simstudy::genCorData( - n, - mu = group$means, - sigma = group$sds, - rho = group$rho, - corstr = group$corstr, - cnames = group$names - )) -} - -generate_correlated_native <- function(sim_def, n, seed = NULL) { - if (!is.null(seed)) { - set.seed(seed) - } - - group <- sim_def$correlation_groups[[1]] - n_vars <- length(group$names) - cor_matrix <- matrix(group$rho, nrow = n_vars, ncol = n_vars) - diag(cor_matrix) <- 1 - z <- matrix(stats::rnorm(n * n_vars), nrow = n) - values <- z %*% chol(cor_matrix) - values <- sweep(values, 2, group$sds, `*`) - values <- sweep(values, 2, group$means, `+`) - out <- as.data.frame(values) - names(out) <- group$names - out$id <- seq_len(n) - out[c("id", group$names)] -} - -inject_missing_codes <- function(values, - missing_codes, - missing_proportions, - seed = NULL, - return_assignment = FALSE) { - assignment <- rep("valid", length(values)) - - if (length(missing_codes) == 0 || sum(missing_proportions, na.rm = TRUE) <= 0) { - if (return_assignment) { - return(list(values = values, assignment = assignment)) - } - return(values) - } - - if (!is.null(seed)) { - set.seed(seed) - } - - missing_proportions[is.na(missing_proportions)] <- 0 - valid_prop <- max(0, 1 - sum(missing_proportions)) - assignment <- sample( - c("valid", names(missing_proportions)), - length(values), - replace = TRUE, - prob = c(valid_prop, missing_proportions) - ) - - values <- apply_missing_codes(values, assignment, as.list(missing_codes)) - - if (return_assignment) { - return(list(values = values, assignment = assignment)) - } - - values -} - -coerce_mock_rtype <- function(values, rtype) { - switch( - rtype, - integer = as.integer(round(as.numeric(values))), - numeric = as.numeric(values), - double = as.double(values), - factor = factor(values), - character = as.character(values), - date = as.Date(values), - values - ) -} - -postprocess_mock_data <- function(data, spec, seed = NULL) { - data <- as.data.frame(data) - diagnostics <- list(missing_assignments = list()) - - for (var in spec) { - if (var$type == "date") { - offset_name <- paste0(var$name, "__offset") - data[[var$name]] <- var$range[[1]] + data[[offset_name]] - data[[offset_name]] <- NULL - } - - if (!var$name %in% names(data)) { - next - } - - missing_result <- inject_missing_codes( - data[[var$name]], - missing_codes = var$missing_codes, - missing_proportions = var$missing_proportions, - seed = seed, - return_assignment = TRUE - ) - data[[var$name]] <- missing_result$values - diagnostics$missing_assignments[[var$name]] <- missing_result$assignment - - if (length(var$garbage) > 0) { - var_row <- as.data.frame(var$garbage, stringsAsFactors = FALSE) - data[[var$name]] <- apply_garbage( - data[[var$name]], - var_row = var_row, - variable_type = var$rtype, - missing_codes = unname(unlist(var$missing_codes, use.names = FALSE)), - seed = seed - ) - } - - data[[var$name]] <- coerce_mock_rtype(data[[var$name]], var$rtype) - } - - attr(data, "mockdata_diagnostics") <- diagnostics - data -} - -example_recodeflow_metadata <- function() { - variables <- data.frame( - variable = c("age", "smoking", "interview_date"), - variableType = c("continuous", "categorical", "date"), - rType = c("integer", "integer", "date"), - distribution = c("normal", NA, "uniform"), - mean = c(50, NA, NA), - sd = c(15, NA, NA), - garbage_low_prop = c(0.02, NA, NA), - garbage_low_range = c("[0,17]", NA, NA), - garbage_high_prop = c(0.02, NA, NA), - garbage_high_range = c("[101,115]", NA, NA), - stringsAsFactors = FALSE - ) - - variable_details <- data.frame( - variable = c( - "age", "age", "age", - "smoking", "smoking", "smoking", "smoking", - "interview_date" - ), - databaseStart = "minimal-example", - recStart = c( - "[18,100]", "997", "998", - "1", "2", "3", "7", - "[2001-01-01;2005-12-31]" - ), - recEnd = c( - "valid", "NA::b", "NA::b", - "valid", "valid", "valid", "NA::b", - "valid" - ), - proportion = c( - NA, 0.02, 0.01, - 0.50, 0.30, 0.17, 0.03, - NA - ), - catLabel = c( - NA, "don't know", "refused", - "never", "former", "current", "don't know", - NA - ), - stringsAsFactors = FALSE - ) - - list(variables = variables, variable_details = variable_details) -} - -run_correlated_height_weight <- function(n = 1000, - seed = 123, - backend = c("simstudy", "native")) { - backend <- match.arg(backend) - correlation_spec <- new_mock_spec(list( - height_cm = mock_spec_correlated_continuous( - "height_cm", - mean = 170, - sd = 10, - correlation_group = "body_size" - ), - weight_kg = mock_spec_correlated_continuous( - "weight_kg", - mean = 78, - sd = 16, - correlation_group = "body_size" - ) - ), - provenance = list(adapter = "direct", source = "correlation prototype"), - correlation_groups = list(body_size = list(rho = 0.65, corstr = "cs"))) - - sim_def <- as_simstudy_def(correlation_spec) - out <- if (backend == "simstudy") { - generate_correlated_simstudy(sim_def, n = n, seed = seed) - } else { - generate_correlated_native(sim_def, n = n, seed = seed) - } - - list(spec = correlation_spec, simstudy_def = sim_def, data = out) -} - -run_survival_anchor <- function(n = 1000, seed = 123) { - if (!simstudy_available) { - stop("simstudy is not available; survival generation is an advanced backend test.", call. = FALSE) - } - - set.seed(seed) - base_def <- simstudy::defData(varname = "exposed", formula = 0.40, dist = "binary") - surv_def <- simstudy::defSurv( - varname = "event_time", - formula = "0.3 * exposed", - scale = 600, - shape = 1 - ) - surv_def <- simstudy::defSurv(surv_def, varname = "censor_time", scale = 1500, shape = 1) - - data <- simstudy::genData(n, base_def) - data <- simstudy::genSurv( - data, - surv_def, - timeName = "followup_days", - censorName = "censor_time", - eventName = "event" - ) - - data <- as.data.frame(data) - entry_start <- as.Date("2001-01-01") - data$entry_date <- entry_start + sample(0:365, n, replace = TRUE) - data$event_date <- as.Date(NA) - data$censor_date <- as.Date(NA) - event_idx <- which(data$event == 1) - censor_idx <- which(data$event == 0) - data$event_date[event_idx] <- data$entry_date[event_idx] + round(data$followup_days[event_idx]) - data$censor_date[censor_idx] <- data$entry_date[censor_idx] + round(data$followup_days[censor_idx]) - data -} - -run_spike <- function(n = 1000, seed = 123, backend = c("simstudy", "native")) { - backend <- match.arg(backend) - - metadata <- example_recodeflow_metadata() - spec <- as_mock_spec_from_recodeflow( - metadata$variables, - metadata$variable_details, - databaseStart = "minimal-example" - ) - spec[["high_visits"]] <- mock_spec_binary_formula( - "high_visits", - "-4 + 0.04 * age + 0.8 * (smoking == 3)" - ) - - baseline <- if (backend == "simstudy") { - generate_mock_data_simstudy(spec, n = n, seed = seed) - } else { - generate_mock_data_native(spec, n = n, seed = seed) - } - final <- postprocess_mock_data(baseline, spec, seed = seed + 1) - correlated <- if (backend == "simstudy") run_correlated_height_weight(n, seed = seed) else NULL - survival <- if (backend == "simstudy") run_survival_anchor(n, seed = seed) else NULL - - list( - backend = backend, - spec = spec, - spec_table = spec_table(spec), - simstudy_def = if (backend == "simstudy") as_simstudy_def(spec) else NULL, - baseline = as.data.frame(baseline), - final = final, - correlated = correlated, - survival = survival - ) -} - -assert_spike <- function(result) { - spec <- result$spec - baseline <- result$baseline - final <- result$final - diagnostics <- attr(final, "mockdata_diagnostics") - - stopifnot(identical(attr(spec, "spec_version"), "0.4-spike-1")) - stopifnot(identical(attr(spec, "model_hint"), "hybrid")) - stopifnot(isTRUE(all.equal(as.numeric(spec$age$range), c(18, 100)))) - stopifnot(identical(spec$smoking$levels, c("1", "2", "3"))) - if (result$backend == "simstudy") { - stopifnot(inherits(result$simstudy_def, "mock_simstudy_def")) - stopifnot("data.table" %in% class(result$simstudy_def$data_def)) - } - stopifnot(all(c("age", "smoking", "interview_date", "high_visits") %in% names(final))) - stopifnot(all(baseline$age >= 18 & baseline$age <= 100)) - stopifnot(abs(mean(baseline$age) - 50) < 2) - stopifnot(abs(stats::sd(baseline$age) - 15) < 3) - stopifnot(is.integer(final$age)) - stopifnot(all(baseline$smoking %in% c(1, 2, 3))) - smoking_props <- prop.table(table(baseline$smoking)) - stopifnot(abs(unname(smoking_props["1"]) - 0.50) < 0.07) - stopifnot(abs(unname(smoking_props["2"]) - 0.30) < 0.07) - stopifnot(abs(unname(smoking_props["3"]) - 0.20) < 0.07) - stopifnot(all(final$high_visits %in% c(0, 1))) - stopifnot(any(final$high_visits == 1)) - stopifnot(inherits(final$interview_date, "Date")) - stopifnot(all(final$interview_date >= as.Date("2001-01-01"))) - stopifnot(all(final$interview_date <= as.Date("2005-12-31"))) - stopifnot(any(final$age %in% c(997L, 998L))) - stopifnot(any(final$age < 18L | final$age > 100L)) - age_valid_assignment <- diagnostics$missing_assignments$age == "valid" - age_garbage_rate <- mean( - (final$age < 18L | final$age > 100L) & age_valid_assignment, - na.rm = TRUE - ) - stopifnot(age_garbage_rate > 0.02) - stopifnot(age_garbage_rate < 0.06) - stopifnot(any(final$smoking == 7L)) - stopifnot(any(diagnostics$missing_assignments$smoking == "7")) - stopifnot(abs(mean(diagnostics$missing_assignments$smoking == "7") - 0.03) < 0.03) - - if (result$backend == "simstudy") { - correlated <- result$correlated$data - correlation_spec <- result$correlated$spec - survival <- result$survival - - stopifnot(all(vapply(correlation_spec, `[[`, character(1), "correlation_group") == "body_size")) - stopifnot(inherits(result$correlated$simstudy_def, "mock_simstudy_def")) - stopifnot(abs(stats::cor(correlated$height_cm, correlated$weight_kg) - 0.65) < 0.08) - stopifnot(abs(mean(correlated$height_cm) - 170) < 2) - stopifnot(abs(mean(correlated$weight_kg) - 78) < 3) - stopifnot(abs(stats::sd(correlated$height_cm) - 10) < 2) - stopifnot(abs(stats::sd(correlated$weight_kg) - 16) < 3) - stopifnot(all(survival$followup_days >= 0)) - stopifnot(any(survival$event == 1)) - stopifnot(all(is.na(survival$event_date) | survival$event_date >= survival$entry_date)) - stopifnot(all(is.na(survival$censor_date) | survival$censor_date >= survival$entry_date)) - stopifnot(!any(!is.na(survival$event_date) & !is.na(survival$censor_date))) - } - - invisible(TRUE) -} - -assert_native_fallback <- function(n = 1000, seed = 123) { - native_result <- run_spike(n = n, seed = seed, backend = "native") - final <- native_result$final - - stopifnot(is.null(native_result$simstudy_def)) - stopifnot(all(c("age", "smoking", "interview_date", "high_visits") %in% names(final))) - stopifnot(is.integer(final$age)) - stopifnot(inherits(final$interview_date, "Date")) - stopifnot(any(final$age %in% c(997L, 998L))) - stopifnot(any(final$smoking == 7L)) - - invisible(native_result) -} - -assert_missing_collision_case <- function(seed = 123) { - spec <- new_mock_spec(list( - collision_code = mock_spec_categorical( - name = "collision_code", - levels = c("1", "2", "97", "99"), - proportions = c(0.20, 0.20, 0.50, 0.10), - rtype = "integer", - missing_codes = c("97" = "97"), - missing_proportions = c("97" = 0.10), - provenance = "collision-test", - model_hint = "diagnostic-required" - ) - )) - - baseline <- generate_mock_data_native(spec, n = 1000, seed = seed) - final <- postprocess_mock_data(baseline, spec, seed = seed + 1) - assignment <- attr(final, "mockdata_diagnostics")$missing_assignments$collision_code - - stopifnot(any(baseline$collision_code == "97")) - stopifnot(any(final$collision_code == 97L & assignment == "valid")) - stopifnot(any(final$collision_code == 97L & assignment == "97")) - - invisible(final) -} - -assert_non_numeric_categorical_labels <- function(seed = 123) { - spec <- new_mock_spec(list( - smoking_label = mock_spec_categorical( - name = "smoking_label", - levels = c("never", "former", "current"), - proportions = c(0.50, 0.30, 0.20), - rtype = "character", - provenance = "label-test", - model_hint = "simstudy-or-native" - ) - )) - - native <- postprocess_mock_data( - generate_mock_data_native(spec, n = 1000, seed = seed), - spec, - seed = seed + 1 - ) - stopifnot(all(native$smoking_label %in% c("never", "former", "current"))) - - if (simstudy_available) { - sim <- postprocess_mock_data( - generate_mock_data_simstudy(spec, n = 1000, seed = seed), - spec, - seed = seed + 1 - ) - stopifnot(all(sim$smoking_label %in% c("never", "former", "current"))) - } - - invisible(TRUE) -} - -assert_formula_dependency_validation <- function() { - spec <- new_mock_spec(list( - outcome = mock_spec_binary_formula("outcome", "-1 + missing_predictor") - )) - - error <- tryCatch( - { - validate_formula_referents(spec) - NULL - }, - error = conditionMessage - ) - stopifnot(grepl("missing_predictor", error)) - - unordered <- new_mock_spec(list( - outcome = mock_spec_binary_formula("outcome", "-4 + 0.04 * age"), - age = mock_spec_continuous( - "age", - range = c(18, 100), - distribution = "normal", - mean = 50, - sd = 15, - rtype = "integer" - ) - )) - ordered <- order_spec_by_dependencies(unordered) - stopifnot(identical(names(ordered), c("age", "outcome"))) - - invisible(TRUE) -} - -assert_truncated_normal_boundaries <- function() { - error <- tryCatch( - { - mockdata_rtrunc_norm(10, min = 5, max = 5, mu = 5, s = 1) - NULL - }, - error = conditionMessage - ) - stopifnot(grepl("min < max", error)) - - invisible(TRUE) -} - -assert_seed_reproducibility <- function() { - first <- run_spike(seed = 123, backend = "native")$final - second <- run_spike(seed = 123, backend = "native")$final - stopifnot(identical(first, second)) - - if (simstudy_available) { - first_simstudy <- run_spike(seed = 123, backend = "simstudy")$final - second_simstudy <- run_spike(seed = 123, backend = "simstudy")$final - stopifnot(identical(first_simstudy, second_simstudy)) - } - - invisible(TRUE) -} - -from_linkml <- function(...) { - stop( - "from_linkml() is a forward-compatibility placeholder for a future ", - "third input adapter; it is not implemented in this spike.", - call. = FALSE - ) -} - -spike_result <- if (simstudy_available) { - run_spike(backend = "simstudy") -} else { - message("simstudy is not available; running native fallback assertions only.") - run_spike(backend = "native") -} -assert_spike(spike_result) -native_result <- assert_native_fallback() -collision_result <- assert_missing_collision_case() -assert_non_numeric_categorical_labels() -assert_formula_dependency_validation() -assert_truncated_normal_boundaries() -assert_seed_reproducibility() - -cat("MockData v0.4 simstudy spike passed.\n\n") -print(spike_result$spec_table) -cat("\nGenerated data preview:\n") -print(utils::head(spike_result$final)) -if (simstudy_available) { - cat("\nCorrelated height/weight correlation:\n") - print(stats::cor( - spike_result$correlated$data$height_cm, - spike_result$correlated$data$weight_kg - )) - cat("\nSurvival preview:\n") - print(utils::head(spike_result$survival[c("id", "exposed", "followup_days", "event", "entry_date", "event_date", "censor_date")])) -} -cat("\nNative fallback preview:\n") -print(utils::head(native_result$final)) -cat("\nMissing-code collision preview:\n") -print(utils::head(collision_result)) diff --git a/tests/testthat/test-mock-spec.R b/tests/testthat/test-mock-spec.R new file mode 100644 index 0000000..bc8de92 --- /dev/null +++ b/tests/testthat/test-mock-spec.R @@ -0,0 +1,143 @@ +test_that("mock_spec creates an empty specification", { + spec <- mock_spec() + + expect_s3_class(spec, "mock_spec") + expect_true(is_mock_spec(spec)) + expect_equal(spec$spec_version, "0.4.0") + expect_equal(length(spec$variables), 0) + expect_true(validate_mock_spec(spec)$valid) +}) + +test_that("mock_spec accepts NULL as an empty specification", { + spec <- mock_spec(NULL) + + expect_s3_class(spec, "mock_spec") + expect_equal(length(spec$variables), 0) + expect_true(validate_mock_spec(spec, n = 0)$valid) +}) + +test_that("mock_spec supports single continuous variable specs", { + age <- mock_spec_continuous( + name = "age", + range = c(18, 85), + distribution = "normal", + mean = 50, + sd = 12, + rtype = "integer", + missing_codes = c(997, 998), + missing_proportions = c(0.02, 0.01) + ) + spec <- mock_spec(age) + + expect_s3_class(age, "mock_spec_variable") + expect_named(spec$variables, "age") + expect_equal(spec$variables$age$type, "continuous") + expect_equal(spec$variables$age$range, c(18, 85)) + expect_equal(spec$variables$age$provenance$adapter, "direct") + expect_true(validate_mock_spec(spec, n = 1)$valid) +}) + +test_that("mock_spec supports categorical variable specs", { + smoking <- mock_spec_categorical( + name = "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + spec <- mock_spec(list(smoking)) + + expect_named(spec$variables, "smoking") + expect_equal(spec$variables$smoking$type, "categorical") + expect_equal(spec$variables$smoking$levels, c("never", "former", "current")) + expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.2)) + expect_true(validate_mock_spec(spec)$valid) +}) + +test_that("mock_spec supports date variable specs", { + interview_date <- mock_spec_date( + name = "interview_date", + range = as.Date(c("2001-01-01", "2005-12-31")), + source_format = "analysis" + ) + spec <- mock_spec(interview_date) + + expect_named(spec$variables, "interview_date") + expect_equal(spec$variables$interview_date$type, "date") + expect_s3_class(spec$variables$interview_date$range, "Date") + expect_equal(spec$variables$interview_date$model_hint, "native-postprocess") + expect_true(validate_mock_spec(spec, n = 0)$valid) +}) + +test_that("mock_spec validates n as a non-negative whole number", { + spec <- mock_spec() + + expect_true(validate_mock_spec(spec, n = 0)$valid) + expect_error(validate_mock_spec(spec, n = -1), "non-negative whole number") + expect_error(validate_mock_spec(spec, n = 1.5), "non-negative whole number") + expect_error(validate_mock_spec(spec, n = NA_real_), "non-negative whole number") +}) + +test_that("validate_mock_spec returns structured errors when strict is FALSE", { + spec <- mock_spec(mock_spec_categorical( + name = "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3) + )) + + result <- validate_mock_spec(spec, strict = FALSE) + + expect_false(result$valid) + expect_true(any(grepl("one proportion per level", result$errors))) +}) + +test_that("validate_mock_spec catches malformed continuous and date ranges", { + bad_continuous <- mock_spec(mock_spec_continuous( + name = "age", + range = c(85, 18) + )) + expect_error(validate_mock_spec(bad_continuous), "lower bound") + + bad_date <- mock_spec(mock_spec_date( + name = "interview_date", + range = c("2001-01-01", "2005-12-31") + )) + expect_error(validate_mock_spec(bad_date), "range must be Date") +}) + +test_that("validate_mock_spec catches normal distribution parameter errors", { + bad_normal <- mock_spec(mock_spec_continuous( + name = "age", + range = c(18, 85), + distribution = "normal", + mean = 50, + sd = 0 + )) + + expect_error(validate_mock_spec(bad_normal), "sd > 0") +}) + +test_that("mock_spec rejects duplicate variable names", { + spec <- mock_spec( + mock_spec_continuous("age", range = c(18, 85)), + mock_spec_continuous("age", range = c(0, 100)) + ) + + result <- validate_mock_spec(spec, strict = FALSE) + + expect_false(result$valid) + expect_true(any(grepl("unique", result$errors))) +}) + +test_that("mock_spec validates model hints", { + expect_error( + mock_spec_continuous("age", range = c(18, 85), model_hint = "magic"), + "model_hint" + ) +}) + +test_that("validate_mock_spec rejects non-spec objects", { + result <- validate_mock_spec(list(), strict = FALSE) + + expect_false(result$valid) + expect_true(any(grepl("mock_spec", result$errors))) +}) From 75484a3f6a001b59559241d4adf9e8fd48f897cc Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 07:04:04 -0400 Subject: [PATCH 02/18] Address mock_spec milestone review --- NAMESPACE | 1 + NEWS.md | 12 ++++ R/mock_spec.R | 118 +++++++++++++++++++++++++++++++- tests/testthat/test-mock-spec.R | 66 ++++++++++++++++-- 4 files changed, 190 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 33c9e9a..0c73a0a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(print,mock_spec_validation_result) S3method(print,mockdata_validation_result) export(add_garbage) export(apply_garbage) diff --git a/NEWS.md b/NEWS.md index 3c5a00a..b67598b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# MockData 0.4.0 + +## Development + +- Started the v0.4 production refactor around a normalized `mock_spec` + architecture. +- Added `mock_spec()`, `mock_spec_continuous()`, `mock_spec_categorical()`, + `mock_spec_date()`, `is_mock_spec()`, and `validate_mock_spec()`. +- Added forward-compatible specification fields: `spec_version`, `provenance`, + and `model_hint`. +- Existing v0.3 generator APIs remain available while v0.4 internals are built. + # MockData 0.3.0 ## Breaking changes diff --git a/R/mock_spec.R b/R/mock_spec.R index 69b8a0a..96ed932 100644 --- a/R/mock_spec.R +++ b/R/mock_spec.R @@ -17,6 +17,29 @@ "diagnostic-required" ) +#' Model hints for MockData specifications +#' +#' Model hints are lightweight backend guidance carried by `mock_spec` objects +#' and variables. They are not generation commands; generation backends may use +#' them to choose a sensible default path. +#' +#' Supported values: +#' \describe{ +#' \item{`auto`}{Let MockData choose the backend.} +#' \item{`native`}{Prefer the native MockData backend.} +#' \item{`simstudy`}{Prefer the optional `simstudy` backend.} +#' \item{`native-postprocess`}{Generate baseline values natively, then rely +#' on MockData post-processing such as date/source-format conversion.} +#' \item{`simstudy-or-native`}{Either backend is expected to be suitable.} +#' \item{`simstudy-advanced`}{Feature is expected to need advanced `simstudy` +#' support, such as correlations or survival durations.} +#' \item{`diagnostic-required`}{Generation/post-processing must preserve +#' diagnostics needed to interpret the result.} +#' } +#' +#' @name mock_spec_model_hints +NULL + `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -135,16 +158,31 @@ #' @param provenance List or character describing where the spec came from. #' @param model_hint Character backend hint. One of the supported MockData model #' hints. +#' @param validate Logical. If `TRUE`, validate the constructed specification +#' before returning it. #' #' @return S3 object of class `mock_spec`. +#' +#' @examples +#' spec <- mock_spec( +#' mock_spec_continuous("age", range = c(18, 85), rtype = "integer"), +#' mock_spec_categorical( +#' "smoking", +#' levels = c("never", "former", "current"), +#' proportions = c(0.5, 0.3, 0.2) +#' ) +#' ) +#' validate_mock_spec(spec) +#' #' @export mock_spec <- function(..., spec_version = .mock_spec_version, provenance = list(adapter = "direct", source = "direct"), - model_hint = "auto") { + model_hint = "auto", + validate = TRUE) { .validate_model_hint(model_hint) - structure( + spec <- structure( list( spec_version = spec_version, provenance = .normalize_provenance(provenance), @@ -153,6 +191,12 @@ mock_spec <- function(..., ), class = c("mock_spec", "list") ) + + if (isTRUE(validate)) { + validate_mock_spec(spec, strict = TRUE) + } + + spec } #' Create a continuous variable specification @@ -170,6 +214,17 @@ mock_spec <- function(..., #' @param model_hint Backend hint. #' #' @return A `mock_spec_variable` object. +#' +#' @examples +#' age <- mock_spec_continuous( +#' "age", +#' range = c(18, 85), +#' distribution = "normal", +#' mean = 50, +#' sd = 12, +#' rtype = "integer" +#' ) +#' #' @export mock_spec_continuous <- function(name, range, @@ -212,6 +267,15 @@ mock_spec_continuous <- function(name, #' @param model_hint Backend hint. #' #' @return A `mock_spec_variable` object. +#' +#' @examples +#' smoking <- mock_spec_categorical( +#' "smoking", +#' levels = c("never", "former", "current"), +#' proportions = c(0.5, 0.3, 0.2), +#' rtype = "character" +#' ) +#' #' @export mock_spec_categorical <- function(name, levels, @@ -251,6 +315,13 @@ mock_spec_categorical <- function(name, #' @param model_hint Backend hint. #' #' @return A `mock_spec_variable` object. +#' +#' @examples +#' interview_date <- mock_spec_date( +#' "interview_date", +#' range = as.Date(c("2001-01-01", "2005-12-31")) +#' ) +#' #' @export mock_spec_date <- function(name, range, @@ -281,6 +352,12 @@ mock_spec_date <- function(name, #' @param x Object to check. #' #' @return Logical scalar. +#' +#' @examples +#' spec <- mock_spec() +#' is_mock_spec(spec) +#' is_mock_spec(list()) +#' #' @export is_mock_spec <- function(x) { inherits(x, "mock_spec") @@ -301,6 +378,35 @@ is_mock_spec <- function(x) { ) } +#' @export +print.mock_spec_validation_result <- function(x, ...) { + status <- if (isTRUE(x$valid)) "valid" else "invalid" + cat("MockData mock_spec validation result: ", status, "\n", sep = "") + + if (length(x$errors) > 0) { + cat("\nErrors:\n") + for (i in seq_along(x$errors)) { + cat(i, ". ", x$errors[[i]], "\n", sep = "") + } + } + + if (length(x$warnings) > 0) { + cat("\nWarnings:\n") + for (i in seq_along(x$warnings)) { + cat(i, ". ", x$warnings[[i]], "\n", sep = "") + } + } + + if (length(x$info) > 0) { + cat("\nInfo:\n") + for (i in seq_along(x$info)) { + cat(i, ". ", x$info[[i]], "\n", sep = "") + } + } + + invisible(x) +} + .validate_probability_vector <- function(values, label, allow_null = FALSE) { errors <- character(0) @@ -435,6 +541,14 @@ is_mock_spec <- function(x) { #' a validation result object is returned. #' #' @return A `mock_spec_validation_result` object when valid or `strict = FALSE`. +#' +#' @examples +#' spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85))) +#' validate_mock_spec(spec) +#' +#' result <- validate_mock_spec(list(), strict = FALSE) +#' result$valid +#' #' @export validate_mock_spec <- function(spec, n = NULL, strict = TRUE) { errors <- character(0) diff --git a/tests/testthat/test-mock-spec.R b/tests/testthat/test-mock-spec.R index bc8de92..edc3f41 100644 --- a/tests/testthat/test-mock-spec.R +++ b/tests/testthat/test-mock-spec.R @@ -82,7 +82,7 @@ test_that("validate_mock_spec returns structured errors when strict is FALSE", { name = "smoking", levels = c("never", "former", "current"), proportions = c(0.5, 0.3) - )) + ), validate = FALSE) result <- validate_mock_spec(spec, strict = FALSE) @@ -94,13 +94,13 @@ test_that("validate_mock_spec catches malformed continuous and date ranges", { bad_continuous <- mock_spec(mock_spec_continuous( name = "age", range = c(85, 18) - )) + ), validate = FALSE) expect_error(validate_mock_spec(bad_continuous), "lower bound") bad_date <- mock_spec(mock_spec_date( name = "interview_date", range = c("2001-01-01", "2005-12-31") - )) + ), validate = FALSE) expect_error(validate_mock_spec(bad_date), "range must be Date") }) @@ -111,7 +111,7 @@ test_that("validate_mock_spec catches normal distribution parameter errors", { distribution = "normal", mean = 50, sd = 0 - )) + ), validate = FALSE) expect_error(validate_mock_spec(bad_normal), "sd > 0") }) @@ -119,7 +119,8 @@ test_that("validate_mock_spec catches normal distribution parameter errors", { test_that("mock_spec rejects duplicate variable names", { spec <- mock_spec( mock_spec_continuous("age", range = c(18, 85)), - mock_spec_continuous("age", range = c(0, 100)) + mock_spec_continuous("age", range = c(0, 100)), + validate = FALSE ) result <- validate_mock_spec(spec, strict = FALSE) @@ -141,3 +142,58 @@ test_that("validate_mock_spec rejects non-spec objects", { expect_false(result$valid) expect_true(any(grepl("mock_spec", result$errors))) }) + +test_that("mock_spec validates on construction by default", { + expect_error( + mock_spec(mock_spec_categorical( + name = "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3) + )), + "one proportion per level" + ) +}) + +test_that("validate_mock_spec accumulates multiple errors", { + spec <- mock_spec( + mock_spec_categorical( + name = "smoking", + levels = character(0), + proportions = c(0.5, 0.5) + ), + validate = FALSE + ) + + result <- validate_mock_spec(spec, strict = FALSE) + + expect_false(result$valid) + expect_true(length(result$errors) >= 2) + expect_true(any(grepl("at least one level", result$errors))) + expect_true(any(grepl("one proportion per level", result$errors))) +}) + +test_that("is_mock_spec returns FALSE for non-spec objects", { + expect_false(is_mock_spec(list())) + expect_false(is_mock_spec(NULL)) + expect_false(is_mock_spec(data.frame())) +}) + +test_that("mock_spec preserves spec_version and rejects missing spec_version", { + spec <- mock_spec(spec_version = "0.4.0-test") + + expect_equal(spec$spec_version, "0.4.0-test") + + spec$spec_version <- NA_character_ + result <- validate_mock_spec(spec, strict = FALSE) + + expect_false(result$valid) + expect_true(any(grepl("spec_version", result$errors))) +}) + +test_that("print.mock_spec_validation_result summarizes errors", { + result <- validate_mock_spec(list(), strict = FALSE) + + expect_output(print(result), "invalid") + expect_output(print(result), "Errors") + expect_output(print(result), "spec must be a mock_spec object") +}) From 33d7aee57323aa4d416b3bff28c1de50fe3fa95e Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 07:11:08 -0400 Subject: [PATCH 03/18] Add direct mock specification helpers --- NAMESPACE | 3 + NEWS.md | 3 + R/mock_spec.R | 191 ++++++++++++++++++++++++++ tests/testthat/test-direct-mock-api.R | 81 +++++++++++ 4 files changed, 278 insertions(+) create mode 100644 tests/testthat/test-direct-mock-api.R diff --git a/NAMESPACE b/NAMESPACE index 0c73a0a..cb0f2e3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,9 @@ export(identify_derived_vars) export(import_from_recodeflow) export(is_mock_spec) export(make_garbage) +export(mock_categorical) +export(mock_continuous) +export(mock_date) export(mock_spec) export(mock_spec_categorical) export(mock_spec_continuous) diff --git a/NEWS.md b/NEWS.md index b67598b..80afcd5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,9 @@ architecture. - Added `mock_spec()`, `mock_spec_continuous()`, `mock_spec_categorical()`, `mock_spec_date()`, `is_mock_spec()`, and `validate_mock_spec()`. +- Added direct specification helpers `mock_continuous()`, + `mock_categorical()`, and `mock_date()` for simple use without + recodeflow-style metadata tables. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec.R b/R/mock_spec.R index 96ed932..928a5a2 100644 --- a/R/mock_spec.R +++ b/R/mock_spec.R @@ -146,6 +146,13 @@ NULL variables } +.direct_api_provenance <- function(source, provenance = NULL) { + .normalize_provenance( + provenance %||% list(adapter = "direct", source = source), + source = source + ) +} + #' Create a MockData specification #' #' `mock_spec()` creates the normalized v0.4 specification object used by the @@ -199,6 +206,190 @@ mock_spec <- function(..., spec } +#' Create a direct continuous mock-data specification +#' +#' `mock_continuous()` is the simple direct API for continuous variables. It +#' returns a validated `mock_spec`; it does not generate data. Generation +#' backends will consume this specification in a later v0.4 milestone. +#' +#' @param name Variable name. +#' @param range Numeric vector of length two giving the inclusive valid range. +#' @param distribution Distribution name. Defaults to `"uniform"`. +#' @param mean,sd Optional distribution parameters. Required when +#' `distribution = "normal"`. +#' @param rtype R output type. Defaults to `"double"`. +#' @param missing_codes Explicit missing-code values. +#' @param missing_proportions Missing-code probabilities aligned to +#' `missing_codes`. +#' @param garbage_rules List of intentional invalid-value rules. +#' @param provenance Optional provenance metadata. Defaults to the direct API. +#' @param model_hint Backend hint. +#' @param spec_version Character version of the specification shape. +#' +#' @return A validated `mock_spec` object containing one continuous variable. +#' +#' @examples +#' age_spec <- mock_continuous( +#' "age", +#' range = c(18, 85), +#' distribution = "normal", +#' mean = 50, +#' sd = 12, +#' rtype = "integer" +#' ) +#' validate_mock_spec(age_spec) +#' +#' @export +mock_continuous <- function(name, + range, + distribution = "uniform", + mean = NA_real_, + sd = NA_real_, + rtype = "double", + missing_codes = numeric(0), + missing_proportions = numeric(0), + garbage_rules = list(), + provenance = NULL, + model_hint = "auto", + spec_version = .mock_spec_version) { + provenance <- .direct_api_provenance("mock_continuous", provenance) + + mock_spec( + mock_spec_continuous( + name = name, + range = range, + distribution = distribution, + mean = mean, + sd = sd, + rtype = rtype, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = model_hint + ), + spec_version = spec_version, + provenance = provenance, + model_hint = model_hint + ) +} + +#' Create a direct categorical mock-data specification +#' +#' `mock_categorical()` is the simple direct API for categorical variables. It +#' returns a validated `mock_spec`; it does not generate data. +#' +#' @param name Variable name. +#' @param levels Character vector of valid levels or codes. +#' @param proportions Optional probabilities aligned to `levels`. +#' @param rtype R output type. Defaults to `"factor"`. +#' @param missing_codes Explicit missing-code values. +#' @param missing_proportions Missing-code probabilities aligned to +#' `missing_codes`. +#' @param garbage_rules List of intentional invalid-value rules. +#' @param provenance Optional provenance metadata. Defaults to the direct API. +#' @param model_hint Backend hint. +#' @param spec_version Character version of the specification shape. +#' +#' @return A validated `mock_spec` object containing one categorical variable. +#' +#' @examples +#' smoking_spec <- mock_categorical( +#' "smoking", +#' levels = c("never", "former", "current"), +#' proportions = c(0.5, 0.3, 0.2), +#' rtype = "character" +#' ) +#' validate_mock_spec(smoking_spec) +#' +#' @export +mock_categorical <- function(name, + levels, + proportions = NULL, + rtype = "factor", + missing_codes = character(0), + missing_proportions = numeric(0), + garbage_rules = list(), + provenance = NULL, + model_hint = "auto", + spec_version = .mock_spec_version) { + provenance <- .direct_api_provenance("mock_categorical", provenance) + + mock_spec( + mock_spec_categorical( + name = name, + levels = levels, + proportions = proportions, + rtype = rtype, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = model_hint + ), + spec_version = spec_version, + provenance = provenance, + model_hint = model_hint + ) +} + +#' Create a direct date mock-data specification +#' +#' `mock_date()` is the simple direct API for date variables. It returns a +#' validated `mock_spec`; it does not generate data. +#' +#' @param name Variable name. +#' @param range Date vector of length two giving the inclusive valid date range. +#' @param rtype R output type. Defaults to `"date"`. +#' @param source_format Source-format hint. Defaults to `"analysis"`. +#' @param missing_codes Explicit missing-code values. +#' @param missing_proportions Missing-code probabilities aligned to +#' `missing_codes`. +#' @param garbage_rules List of intentional invalid-value rules. +#' @param provenance Optional provenance metadata. Defaults to the direct API. +#' @param model_hint Backend hint. +#' @param spec_version Character version of the specification shape. +#' +#' @return A validated `mock_spec` object containing one date variable. +#' +#' @examples +#' interview_date_spec <- mock_date( +#' "interview_date", +#' range = as.Date(c("2001-01-01", "2005-12-31")) +#' ) +#' validate_mock_spec(interview_date_spec) +#' +#' @export +mock_date <- function(name, + range, + rtype = "date", + source_format = "analysis", + missing_codes = character(0), + missing_proportions = numeric(0), + garbage_rules = list(), + provenance = NULL, + model_hint = "native-postprocess", + spec_version = .mock_spec_version) { + provenance <- .direct_api_provenance("mock_date", provenance) + + mock_spec( + mock_spec_date( + name = name, + range = range, + rtype = rtype, + source_format = source_format, + missing_codes = missing_codes, + missing_proportions = missing_proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = model_hint + ), + spec_version = spec_version, + provenance = provenance, + model_hint = model_hint + ) +} + #' Create a continuous variable specification #' #' @param name Variable name. diff --git a/tests/testthat/test-direct-mock-api.R b/tests/testthat/test-direct-mock-api.R new file mode 100644 index 0000000..2248282 --- /dev/null +++ b/tests/testthat/test-direct-mock-api.R @@ -0,0 +1,81 @@ +test_that("mock_continuous creates a validated one-variable spec", { + spec <- mock_continuous( + "age", + range = c(18, 85), + distribution = "normal", + mean = 50, + sd = 12, + rtype = "integer", + missing_codes = c(997, 998), + missing_proportions = c(0.02, 0.01) + ) + + expect_s3_class(spec, "mock_spec") + expect_named(spec$variables, "age") + expect_equal(spec$variables$age$type, "continuous") + expect_equal(spec$variables$age$rtype, "integer") + expect_equal(spec$provenance$adapter, "direct") + expect_equal(spec$provenance$source, "mock_continuous") + expect_true(validate_mock_spec(spec)$valid) +}) + +test_that("mock_categorical creates a validated one-variable spec", { + spec <- mock_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + + expect_s3_class(spec, "mock_spec") + expect_named(spec$variables, "smoking") + expect_equal(spec$variables$smoking$type, "categorical") + expect_equal(spec$variables$smoking$levels, c("never", "former", "current")) + expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.2)) + expect_equal(spec$provenance$source, "mock_categorical") + expect_true(validate_mock_spec(spec)$valid) +}) + +test_that("mock_date creates a validated one-variable spec", { + spec <- mock_date( + "interview_date", + range = as.Date(c("2001-01-01", "2005-12-31")) + ) + + expect_s3_class(spec, "mock_spec") + expect_named(spec$variables, "interview_date") + expect_equal(spec$variables$interview_date$type, "date") + expect_s3_class(spec$variables$interview_date$range, "Date") + expect_equal(spec$model_hint, "native-postprocess") + expect_equal(spec$provenance$source, "mock_date") + expect_true(validate_mock_spec(spec)$valid) +}) + +test_that("direct mock APIs validate immediately", { + expect_error( + mock_continuous( + "age", + range = c(18, 85), + distribution = "normal", + mean = 50 + ), + "sd > 0" + ) + + expect_error( + mock_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3) + ), + "one proportion per level" + ) + + expect_error( + mock_date( + "interview_date", + range = c("2001-01-01", "2005-12-31") + ), + "range must be Date" + ) +}) From f29414defb3fd815b47e92dd6ac7b40cfbe45648 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 09:24:35 -0400 Subject: [PATCH 04/18] Tighten mock_spec auditability contracts --- R/mock_spec.R | 99 +++++++++++++++++++++++---- development/adr/v04-hybrid-backend.md | 7 +- tests/testthat/test-direct-mock-api.R | 73 ++++++++++++++++++++ tests/testthat/test-mock-spec.R | 53 ++++++++++++++ 4 files changed, 219 insertions(+), 13 deletions(-) diff --git a/R/mock_spec.R b/R/mock_spec.R index 928a5a2..7fd3e51 100644 --- a/R/mock_spec.R +++ b/R/mock_spec.R @@ -7,6 +7,8 @@ .mock_spec_version <- "0.4.0" +.mock_spec_probability_tolerance <- 1e-8 + .mock_spec_model_hints <- c( "auto", "native", @@ -44,6 +46,10 @@ NULL if (is.null(x)) y else x } +.is_non_empty_string <- function(x) { + is.character(x) && length(x) == 1 && !is.na(x) && nzchar(trimws(x)) +} + .normalize_provenance <- function(provenance, source = NULL) { if (is.null(provenance)) { provenance <- list(adapter = "direct", source = source %||% "direct") @@ -51,14 +57,22 @@ NULL provenance <- list(adapter = as.character(provenance), source = source %||% as.character(provenance)) } - if (is.null(provenance$adapter) || is.na(provenance$adapter) || provenance$adapter == "") { - provenance$adapter <- "unknown" + if (!.is_non_empty_string(provenance$adapter)) { + stop("provenance$adapter must be a non-empty string.", call. = FALSE) } - if (is.null(provenance$source) || is.na(provenance$source) || provenance$source == "") { + if (is.null(provenance$source) && !is.null(source)) { + provenance$source <- source + } else if (is.null(provenance$source)) { provenance$source <- provenance$adapter } + if (!.is_non_empty_string(provenance$source)) { + stop("provenance$source must be a non-empty string.", call. = FALSE) + } - provenance + c( + list(adapter = provenance$adapter, source = provenance$source), + provenance[setdiff(names(provenance), c("adapter", "source"))] + ) } .validate_model_hint <- function(model_hint) { @@ -147,10 +161,15 @@ NULL } .direct_api_provenance <- function(source, provenance = NULL) { - .normalize_provenance( - provenance %||% list(adapter = "direct", source = source), - source = source - ) + provenance <- provenance %||% list(source = source) + + if (!is.list(provenance)) { + provenance <- list(source = as.character(provenance)) + } + provenance$adapter <- "direct" + provenance$source <- provenance$source %||% source + + .normalize_provenance(provenance, source = source) } #' Create a MockData specification @@ -169,6 +188,8 @@ NULL #' before returning it. #' #' @return S3 object of class `mock_spec`. +#' @family mock specification APIs +#' @seealso [mock_continuous()], [mock_categorical()], [mock_date()] #' #' @examples #' spec <- mock_spec( @@ -227,6 +248,8 @@ mock_spec <- function(..., #' @param spec_version Character version of the specification shape. #' #' @return A validated `mock_spec` object containing one continuous variable. +#' @family direct specification APIs +#' @seealso [mock_spec()], [mock_spec_continuous()] #' #' @examples #' age_spec <- mock_continuous( @@ -292,6 +315,8 @@ mock_continuous <- function(name, #' @param spec_version Character version of the specification shape. #' #' @return A validated `mock_spec` object containing one categorical variable. +#' @family direct specification APIs +#' @seealso [mock_spec()], [mock_spec_categorical()] #' #' @examples #' smoking_spec <- mock_categorical( @@ -351,6 +376,8 @@ mock_categorical <- function(name, #' @param spec_version Character version of the specification shape. #' #' @return A validated `mock_spec` object containing one date variable. +#' @family direct specification APIs +#' @seealso [mock_spec()], [mock_spec_date()] #' #' @examples #' interview_date_spec <- mock_date( @@ -405,6 +432,8 @@ mock_date <- function(name, #' @param model_hint Backend hint. #' #' @return A `mock_spec_variable` object. +#' @family mock specification APIs +#' @seealso [mock_spec()], [mock_continuous()] #' #' @examples #' age <- mock_spec_continuous( @@ -458,6 +487,8 @@ mock_spec_continuous <- function(name, #' @param model_hint Backend hint. #' #' @return A `mock_spec_variable` object. +#' @family mock specification APIs +#' @seealso [mock_spec()], [mock_categorical()] #' #' @examples #' smoking <- mock_spec_categorical( @@ -506,6 +537,8 @@ mock_spec_categorical <- function(name, #' @param model_hint Backend hint. #' #' @return A `mock_spec_variable` object. +#' @family mock specification APIs +#' @seealso [mock_spec()], [mock_date()] #' #' @examples #' interview_date <- mock_spec_date( @@ -622,6 +655,22 @@ print.mock_spec_validation_result <- function(x, ...) { errors } +.validate_provenance <- function(provenance, label) { + errors <- character(0) + + if (!is.list(provenance)) { + return(paste0(label, " provenance must be a list.")) + } + if (!.is_non_empty_string(provenance$adapter)) { + errors <- c(errors, paste0(label, " provenance$adapter must be a non-empty string.")) + } + if (!.is_non_empty_string(provenance$source)) { + errors <- c(errors, paste0(label, " provenance$source must be a non-empty string.")) + } + + errors +} + .validate_missing_spec <- function(variable) { errors <- character(0) @@ -643,7 +692,7 @@ print.mock_spec_validation_result <- function(x, ...) { )) missing_sum <- sum(variable$missing_proportions, na.rm = TRUE) - if (missing_sum > 1) { + if (missing_sum > 1 + .mock_spec_probability_tolerance) { errors <- c(errors, paste0( "Variable '", variable$name, "' missing proportions must sum to <= 1." @@ -685,6 +734,21 @@ print.mock_spec_validation_result <- function(x, ...) { } errors <- c(errors, .validate_missing_spec(variable)) + errors <- c(errors, .validate_provenance( + variable$provenance, + paste0("Variable '", variable$name, "'") + )) + if (is.null(variable$model_hint) || + length(variable$model_hint) != 1 || + is.na(variable$model_hint) || + !variable$model_hint %in% .mock_spec_model_hints) { + errors <- c(errors, paste0( + "Variable '", variable$name, + "' model_hint must be one of: ", + paste(.mock_spec_model_hints, collapse = ", "), + "." + )) + } if (variable$type == "continuous") { errors <- c(errors, .validate_range(variable$range, variable$name, "numeric")) @@ -710,7 +774,7 @@ print.mock_spec_validation_result <- function(x, ...) { allow_null = FALSE )) prop_sum <- sum(variable$proportions, na.rm = TRUE) - if (abs(prop_sum - 1) > 0.001) { + if (abs(prop_sum - 1) > .mock_spec_probability_tolerance) { errors <- c(errors, paste0("Variable '", variable$name, "' proportions must sum to 1.")) } } @@ -749,8 +813,19 @@ validate_mock_spec <- function(spec, n = NULL, strict = TRUE) { if (!is_mock_spec(spec)) { errors <- c(errors, "spec must be a mock_spec object.") } else { - if (is.null(spec$spec_version) || length(spec$spec_version) != 1 || is.na(spec$spec_version)) { - errors <- c(errors, "mock_spec must have a scalar spec_version.") + if (!.is_non_empty_string(spec$spec_version)) { + errors <- c(errors, "mock_spec must have a non-empty scalar spec_version.") + } + errors <- c(errors, .validate_provenance(spec$provenance, "mock_spec")) + if (is.null(spec$model_hint) || + length(spec$model_hint) != 1 || + is.na(spec$model_hint) || + !spec$model_hint %in% .mock_spec_model_hints) { + errors <- c(errors, paste0( + "mock_spec model_hint must be one of: ", + paste(.mock_spec_model_hints, collapse = ", "), + "." + )) } if (is.null(spec$variables) || !is.list(spec$variables)) { errors <- c(errors, "mock_spec variables must be a list.") diff --git a/development/adr/v04-hybrid-backend.md b/development/adr/v04-hybrid-backend.md index eca4420..63fc032 100644 --- a/development/adr/v04-hybrid-backend.md +++ b/development/adr/v04-hybrid-backend.md @@ -112,4 +112,9 @@ Production refactor should proceed in layers: - Whether `mock_spec` is internal-only or partially user-facing in v0.4.0. - How formula/dependency syntax enters from recodeflow or direct APIs. - How Table 1 / summary specifications become a future adapter. - +- How the legacy `var_row` shim used by v0.3 garbage helpers is replaced with + typed v0.4 post-processing specs. +- Empty, `NULL`, `n = 0`, and single-row input behavior across adapters and + backends. +- Seed discipline across native generation, post-processing, and the optional + `simstudy` backend. diff --git a/tests/testthat/test-direct-mock-api.R b/tests/testthat/test-direct-mock-api.R index 2248282..d66c087 100644 --- a/tests/testthat/test-direct-mock-api.R +++ b/tests/testthat/test-direct-mock-api.R @@ -16,6 +16,8 @@ test_that("mock_continuous creates a validated one-variable spec", { expect_equal(spec$variables$age$rtype, "integer") expect_equal(spec$provenance$adapter, "direct") expect_equal(spec$provenance$source, "mock_continuous") + expect_equal(spec$variables$age$provenance$adapter, "direct") + expect_equal(spec$variables$age$provenance$source, "mock_continuous") expect_true(validate_mock_spec(spec)$valid) }) @@ -32,7 +34,10 @@ test_that("mock_categorical creates a validated one-variable spec", { expect_equal(spec$variables$smoking$type, "categorical") expect_equal(spec$variables$smoking$levels, c("never", "former", "current")) expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.2)) + expect_equal(spec$provenance$adapter, "direct") expect_equal(spec$provenance$source, "mock_categorical") + expect_equal(spec$variables$smoking$provenance$adapter, "direct") + expect_equal(spec$variables$smoking$provenance$source, "mock_categorical") expect_true(validate_mock_spec(spec)$valid) }) @@ -47,10 +52,78 @@ test_that("mock_date creates a validated one-variable spec", { expect_equal(spec$variables$interview_date$type, "date") expect_s3_class(spec$variables$interview_date$range, "Date") expect_equal(spec$model_hint, "native-postprocess") + expect_equal(spec$variables$interview_date$model_hint, "native-postprocess") + expect_equal(spec$provenance$adapter, "direct") expect_equal(spec$provenance$source, "mock_date") + expect_equal(spec$variables$interview_date$provenance$adapter, "direct") + expect_equal(spec$variables$interview_date$provenance$source, "mock_date") expect_true(validate_mock_spec(spec)$valid) }) +test_that("direct mock APIs are equivalent to explicit mock_spec wrappers", { + continuous_provenance <- list(adapter = "direct", source = "mock_continuous") + expect_equal( + mock_continuous("age", range = c(18, 85), rtype = "integer"), + mock_spec( + mock_spec_continuous( + "age", + range = c(18, 85), + rtype = "integer", + provenance = continuous_provenance + ), + provenance = continuous_provenance + ) + ) + + categorical_provenance <- list(adapter = "direct", source = "mock_categorical") + expect_equal( + mock_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2) + ), + mock_spec( + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + provenance = categorical_provenance + ), + provenance = categorical_provenance + ) + ) + + date_provenance <- list(adapter = "direct", source = "mock_date") + expect_equal( + mock_date( + "interview_date", + range = as.Date(c("2001-01-01", "2005-12-31")) + ), + mock_spec( + mock_spec_date( + "interview_date", + range = as.Date(c("2001-01-01", "2005-12-31")), + provenance = date_provenance + ), + provenance = date_provenance, + model_hint = "native-postprocess" + ) + ) +}) + +test_that("direct mock APIs keep adapter provenance fixed as direct", { + spec <- mock_continuous( + "age", + range = c(18, 85), + provenance = list(adapter = "not-direct", source = "custom-note") + ) + + expect_equal(spec$provenance$adapter, "direct") + expect_equal(spec$provenance$source, "custom-note") + expect_equal(spec$variables$age$provenance$adapter, "direct") + expect_equal(spec$variables$age$provenance$source, "custom-note") +}) + test_that("direct mock APIs validate immediately", { expect_error( mock_continuous( diff --git a/tests/testthat/test-mock-spec.R b/tests/testthat/test-mock-spec.R index edc3f41..7d748ef 100644 --- a/tests/testthat/test-mock-spec.R +++ b/tests/testthat/test-mock-spec.R @@ -188,6 +188,59 @@ test_that("mock_spec preserves spec_version and rejects missing spec_version", { expect_false(result$valid) expect_true(any(grepl("spec_version", result$errors))) + + spec$spec_version <- "" + result <- validate_mock_spec(spec, strict = FALSE) + + expect_false(result$valid) + expect_true(any(grepl("spec_version", result$errors))) +}) + +test_that("validate_mock_spec checks provenance and model_hint after mutation", { + spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85))) + + spec$provenance <- "not-a-list" + result <- validate_mock_spec(spec, strict = FALSE) + expect_false(result$valid) + expect_true(any(grepl("provenance must be a list", result$errors))) + + spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85))) + spec$provenance$adapter <- "" + result <- validate_mock_spec(spec, strict = FALSE) + expect_false(result$valid) + expect_true(any(grepl("provenance\\$adapter", result$errors))) + + spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85))) + spec$provenance$source <- NA_character_ + result <- validate_mock_spec(spec, strict = FALSE) + expect_false(result$valid) + expect_true(any(grepl("provenance\\$source", result$errors))) + + spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85))) + spec$model_hint <- "magic" + result <- validate_mock_spec(spec, strict = FALSE) + expect_false(result$valid) + expect_true(any(grepl("mock_spec model_hint", result$errors))) + + spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85))) + spec$variables$age$provenance$adapter <- "" + spec$variables$age$model_hint <- "magic" + result <- validate_mock_spec(spec, strict = FALSE) + expect_false(result$valid) + expect_true(any(grepl("Variable 'age' provenance\\$adapter", result$errors))) + expect_true(any(grepl("Variable 'age' model_hint", result$errors))) +}) + +test_that("proportion sums use floating-point tolerance consistently", { + spec <- mock_spec(mock_spec_categorical( + "smoking", + levels = c("never", "former"), + proportions = c(0.5, 0.5 + 1e-15), + missing_codes = c("7", "9"), + missing_proportions = c(0.5, 0.5 + 1e-15) + )) + + expect_true(validate_mock_spec(spec)$valid) }) test_that("print.mock_spec_validation_result summarizes errors", { From bd7b389272b40fe5f6890ef85d13f833a2ec5684 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 09:32:13 -0400 Subject: [PATCH 05/18] Add recodeflow mock_spec adapter --- NAMESPACE | 1 + NEWS.md | 2 + R/mock_spec_recodeflow.R | 416 +++++++++++++++++++++ tests/testthat/test-recodeflow-mock-spec.R | 112 ++++++ 4 files changed, 531 insertions(+) create mode 100644 R/mock_spec_recodeflow.R create mode 100644 tests/testthat/test-recodeflow-mock-spec.R diff --git a/NAMESPACE b/NAMESPACE index cb0f2e3..b1b6f71 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ export(mock_spec) export(mock_spec_categorical) export(mock_spec_continuous) export(mock_spec_date) +export(mock_spec_from_recodeflow) export(parse_range_notation) export(parse_variable_start) export(read_mock_data_config) diff --git a/NEWS.md b/NEWS.md index 80afcd5..9b9d9c5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,8 @@ - Added direct specification helpers `mock_continuous()`, `mock_categorical()`, and `mock_date()` for simple use without recodeflow-style metadata tables. +- Added `mock_spec_from_recodeflow()` to adapt recodeflow-style `variables` + and `variable_details` metadata into validated `mock_spec` objects. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec_recodeflow.R b/R/mock_spec_recodeflow.R new file mode 100644 index 0000000..0274ce5 --- /dev/null +++ b/R/mock_spec_recodeflow.R @@ -0,0 +1,416 @@ +# ============================================================================== +# MockData v0.4 Recodeflow Adapter +# ============================================================================== +# Converts recodeflow-style variables and variable_details metadata into the +# normalized mock_spec representation. +# ============================================================================== + +.read_recodeflow_table <- function(x, label) { + if (is.data.frame(x)) { + return(x) + } + + if (is.character(x) && length(x) == 1) { + if (!file.exists(x)) { + stop(label, " file does not exist: ", x, call. = FALSE) + } + return(read.csv(x, stringsAsFactors = FALSE, check.names = FALSE)) + } + + stop(label, " must be a data frame or a single CSV path.", call. = FALSE) +} + +.is_blank <- function(x) { + is.null(x) || length(x) == 0 || is.na(x[1]) || trimws(as.character(x[1])) == "" +} + +.row_value <- function(row, name, default = NA) { + if (!name %in% names(row)) { + return(default) + } + + value <- row[[name]][1] + if (length(value) == 0) { + return(default) + } + + value +} + +.row_character <- function(row, name, default = NA_character_) { + value <- .row_value(row, name, default) + if (.is_blank(value)) { + return(default) + } + as.character(value) +} + +.row_numeric <- function(row, name, default = NA_real_) { + value <- .row_value(row, name, default) + if (.is_blank(value)) { + return(default) + } + suppressWarnings(as.numeric(value)) +} + +.recodeflow_required_columns <- function(data, required, label) { + missing <- setdiff(required, names(data)) + if (length(missing) > 0) { + stop(label, " is missing required column(s): ", paste(missing, collapse = ", "), call. = FALSE) + } +} + +.filter_recodeflow_by_database <- function(data, databaseStart, allow_empty = TRUE) { + if (is.null(databaseStart) || !"databaseStart" %in% names(data)) { + return(data) + } + + data[.database_start_matches(data$databaseStart, databaseStart, allow_empty = allow_empty), , drop = FALSE] +} + +.filter_recodeflow_details <- function(variable_details, variable, databaseStart) { + if (is.null(variable_details)) { + return(NULL) + } + + details <- variable_details[variable_details$variable == variable, , drop = FALSE] + .filter_recodeflow_by_database(details, databaseStart, allow_empty = TRUE) +} + +.recodeflow_variable_kind <- function(var_row) { + rtype <- tolower(.row_character(var_row, "rType", "")) + variable_type <- tolower(.row_character(var_row, "variableType", "")) + + if (rtype == "date" || variable_type == "date") { + return("date") + } + if (variable_type == "categorical" || rtype %in% c("factor", "character", "logical")) { + return("categorical") + } + if (variable_type == "continuous" || rtype %in% c("integer", "double", "numeric")) { + return("continuous") + } + + stop( + "Variable '", .row_character(var_row, "variable", ""), + "' has unsupported variableType/rType combination: variableType = '", + variable_type, "', rType = '", rtype, "'.", + call. = FALSE + ) +} + +.recodeflow_rtype <- function(var_row, kind) { + rtype <- tolower(.row_character(var_row, "rType", "")) + if (rtype != "") { + if (rtype == "numeric") { + return("double") + } + return(rtype) + } + + switch( + kind, + continuous = "double", + categorical = "factor", + date = "date" + ) +} + +.parse_single_date <- function(value) { + if (.is_blank(value)) { + return(NULL) + } + + parsed <- parse_range_notation(paste0("[", value, ",", value, "]")) + if (!is.null(parsed) && identical(parsed$type, "date")) { + return(c(parsed$min, parsed$max)) + } + + NULL +} + +.recodeflow_valid_rows <- function(details) { + if (is.null(details) || nrow(details) == 0) { + return(details) + } + + rec_start <- as.character(details$recStart) + rec_end <- if ("recEnd" %in% names(details)) as.character(details$recEnd) else rep("", nrow(details)) + + keep <- !is.na(rec_start) & + rec_start != "" & + rec_start != "else" & + !grepl("^garbage_", rec_start, ignore.case = TRUE) & + !grepl("^NA::", rec_end) & + !grepl("^DerivedVar::", rec_start) & + !grepl("^Func::", rec_end) + + details[keep, , drop = FALSE] +} + +.recodeflow_range <- function(details, variable, kind) { + valid_rows <- .recodeflow_valid_rows(details) + if (is.null(valid_rows) || nrow(valid_rows) == 0) { + stop("Variable '", variable, "' has no valid recodeflow detail rows for range extraction.", call. = FALSE) + } + + for (i in seq_len(nrow(valid_rows))) { + rec_start <- valid_rows$recStart[i] + parsed <- parse_range_notation(rec_start) + + if (kind == "date") { + if (!is.null(parsed) && identical(parsed$type, "date")) { + return(c(parsed$min, parsed$max)) + } + + single_date <- .parse_single_date(rec_start) + if (!is.null(single_date)) { + return(single_date) + } + } else { + if (!is.null(parsed) && parsed$type %in% c("integer", "continuous", "single_value")) { + return(c(parsed$min, parsed$max)) + } + } + } + + stop("Variable '", variable, "' has no parseable ", kind, " range in recStart.", call. = FALSE) +} + +.recodeflow_missing <- function(details) { + if (is.null(details) || nrow(details) == 0 || !"recEnd" %in% names(details)) { + return(list(codes = character(0), proportions = numeric(0))) + } + + is_missing <- grepl("^NA::", details$recEnd) & + !is.na(details$recStart) & + details$recStart != "" & + details$recStart != "else" + + missing_rows <- details[is_missing, , drop = FALSE] + if (nrow(missing_rows) == 0) { + return(list(codes = character(0), proportions = numeric(0))) + } + + proportions <- if ("proportion" %in% names(missing_rows)) missing_rows$proportion else rep(NA_real_, nrow(missing_rows)) + proportions[is.na(proportions)] <- 0 + + list( + codes = as.character(missing_rows$recStart), + proportions = as.numeric(proportions) + ) +} + +.recodeflow_distribution <- function(var_row, details) { + distribution <- tolower(.row_character(var_row, "distribution", "")) + if (distribution != "") { + return(distribution) + } + + params <- tryCatch( + extract_distribution_params(details), + error = function(e) list(distribution = "uniform") + ) + params$distribution %||% "uniform" +} + +.recodeflow_garbage_rules <- function(var_row) { + rules <- list() + + low_prop <- .row_numeric(var_row, "garbage_low_prop") + low_range <- .row_character(var_row, "garbage_low_range", "") + if (low_range == "[;]") { + low_range <- "" + } + if ((!is.na(low_prop) && low_prop > 0) || low_range != "") { + rules$low <- list(proportion = low_prop, range = low_range) + } + + high_prop <- .row_numeric(var_row, "garbage_high_prop") + high_range <- .row_character(var_row, "garbage_high_range", "") + if (high_range == "[;]") { + high_range <- "" + } + if ((!is.na(high_prop) && high_prop > 0) || high_range != "") { + rules$high <- list(proportion = high_prop, range = high_range) + } + + rules +} + +.recodeflow_provenance <- function(variable, databaseStart = NULL) { + provenance <- list(adapter = "recodeflow", source = variable) + if (!is.null(databaseStart)) { + provenance$databaseStart <- paste(databaseStart, collapse = ",") + } + provenance +} + +.recodeflow_to_spec_variable <- function(var_row, details, databaseStart) { + variable <- .row_character(var_row, "variable") + kind <- .recodeflow_variable_kind(var_row) + rtype <- .recodeflow_rtype(var_row, kind) + provenance <- .recodeflow_provenance(variable, databaseStart) + missing <- .recodeflow_missing(details) + garbage_rules <- .recodeflow_garbage_rules(var_row) + + if (kind == "categorical") { + proportions <- extract_proportions(details, variable_name = variable) + if (length(proportions$categories) == 0) { + stop("Variable '", variable, "' has no valid categorical levels.", call. = FALSE) + } + + return(mock_spec_categorical( + name = variable, + levels = proportions$categories, + proportions = proportions$category_proportions, + rtype = rtype, + missing_codes = names(proportions$missing), + missing_proportions = as.numeric(unlist(proportions$missing, use.names = FALSE)), + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = "native" + )) + } + + if (kind == "continuous") { + distribution <- .recodeflow_distribution(var_row, details) + + return(mock_spec_continuous( + name = variable, + range = .recodeflow_range(details, variable, "continuous"), + distribution = distribution, + mean = .row_numeric(var_row, "mean"), + sd = .row_numeric(var_row, "sd"), + rtype = rtype, + missing_codes = missing$codes, + missing_proportions = missing$proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = "native" + )) + } + + source_format <- .row_character(var_row, "sourceFormat", "analysis") + + .new_mock_spec_variable( + name = variable, + type = "date", + rtype = rtype, + distribution = .recodeflow_distribution(var_row, details), + range = .recodeflow_range(details, variable, "date"), + source_format = source_format, + missing_codes = missing$codes, + missing_proportions = missing$proportions, + garbage_rules = garbage_rules, + provenance = provenance, + model_hint = "native-postprocess", + rate = .row_numeric(var_row, "rate"), + shape = .row_numeric(var_row, "shape"), + followup_min = .row_numeric(var_row, "followup_min"), + followup_max = .row_numeric(var_row, "followup_max"), + event_prop = .row_numeric(var_row, "event_prop") + ) +} + +#' Convert recodeflow metadata to a MockData specification +#' +#' `mock_spec_from_recodeflow()` adapts recodeflow-style `variables` and +#' `variable_details` metadata into the normalized v0.4 `mock_spec` shape. It +#' returns a validated specification; it does not generate data. +#' +#' @param variables Data frame or CSV path for recodeflow-style `variables` +#' metadata. +#' @param variable_details Data frame, CSV path, or `NULL` for recodeflow-style +#' `variable_details` metadata. +#' @param databaseStart Optional database/cycle token used to filter metadata by +#' exact comma-separated `databaseStart` values. +#' @param role Character vector of role tokens to include. Defaults to +#' `"enabled"`. Use `NULL` to skip role filtering. +#' @param exclude_derived Logical. If `TRUE`, exclude variables identified by +#' `DerivedVar::` or `Func::` rows in `variable_details`. +#' @param spec_version Character version of the specification shape. +#' @param model_hint Backend hint for the returned specification. +#' +#' @return A validated `mock_spec` object. +#' @family mock specification APIs +#' @seealso [mock_spec()], [mock_continuous()], [mock_categorical()], +#' [mock_date()] +#' +#' @examples +#' variables <- data.frame( +#' variable = "age", +#' variableType = "Continuous", +#' rType = "integer", +#' role = "enabled", +#' distribution = "uniform" +#' ) +#' details <- data.frame( +#' variable = "age", +#' recStart = "[18, 85]", +#' recEnd = "copy", +#' proportion = 1 +#' ) +#' spec <- mock_spec_from_recodeflow(variables, details) +#' validate_mock_spec(spec) +#' +#' @export +mock_spec_from_recodeflow <- function(variables, + variable_details = NULL, + databaseStart = NULL, + role = "enabled", + exclude_derived = TRUE, + spec_version = .mock_spec_version, + model_hint = "auto") { + variables <- .read_recodeflow_table(variables, "variables") + variables <- .migrate_garbage_aliases(variables) + .recodeflow_required_columns(variables, "variable", "variables") + + if (!is.null(variable_details)) { + variable_details <- .read_recodeflow_table(variable_details, "variable_details") + .recodeflow_required_columns(variable_details, c("variable", "recStart"), "variable_details") + } + + if (!is.null(role)) { + if (!"role" %in% names(variables)) { + stop("variables must have a 'role' column when role filtering is requested.", call. = FALSE) + } + variables <- variables[.role_matches(variables$role, role, ignore.case = TRUE), , drop = FALSE] + } + + variables <- .filter_recodeflow_by_database(variables, databaseStart, allow_empty = TRUE) + + if (nrow(variables) == 0) { + stop("No variables matched the requested role/database filters.", call. = FALSE) + } + + if (isTRUE(exclude_derived) && !is.null(variable_details)) { + derived <- identify_derived_vars(variables, variable_details) + variables <- variables[!variables$variable %in% derived, , drop = FALSE] + } + + if (nrow(variables) == 0) { + stop("No non-derived variables remain after filtering.", call. = FALSE) + } + + spec_variables <- lapply(seq_len(nrow(variables)), function(i) { + var_row <- variables[i, , drop = FALSE] + details <- .filter_recodeflow_details(variable_details, var_row$variable[1], databaseStart) + .recodeflow_to_spec_variable(var_row, details, databaseStart) + }) + + provenance <- list(adapter = "recodeflow", source = "variables+variable_details") + if (!is.null(databaseStart)) { + provenance$databaseStart <- paste(databaseStart, collapse = ",") + } + if (!is.null(role)) { + provenance$role <- paste(role, collapse = ",") + } + + mock_spec( + spec_variables, + spec_version = spec_version, + provenance = provenance, + model_hint = model_hint + ) +} diff --git a/tests/testthat/test-recodeflow-mock-spec.R b/tests/testthat/test-recodeflow-mock-spec.R new file mode 100644 index 0000000..97db9e7 --- /dev/null +++ b/tests/testthat/test-recodeflow-mock-spec.R @@ -0,0 +1,112 @@ +minimal_example_path <- function(...) { + file.path("..", "..", "inst", "extdata", "minimal-example", ...) +} + +test_that("mock_spec_from_recodeflow converts minimal metadata", { + variables <- read.csv( + minimal_example_path("variables.csv"), + stringsAsFactors = FALSE, + check.names = FALSE + ) + variable_details <- read.csv( + minimal_example_path("variable_details.csv"), + stringsAsFactors = FALSE, + check.names = FALSE + ) + + spec <- mock_spec_from_recodeflow(variables, variable_details) + + expect_s3_class(spec, "mock_spec") + expect_equal(spec$provenance$adapter, "recodeflow") + expect_true(validate_mock_spec(spec)$valid) + expect_false("BMI_derived" %in% names(spec$variables)) + expect_true(all(c("age", "smoking", "interview_date") %in% names(spec$variables))) + + expect_equal(spec$variables$age$type, "continuous") + expect_equal(spec$variables$age$rtype, "integer") + expect_equal(spec$variables$age$distribution, "normal") + expect_equal(spec$variables$age$range, c(18, 100)) + expect_equal(spec$variables$age$missing_codes, c("997", "998", "999")) + expect_length(spec$variables$age$garbage_rules, 0) + + expect_equal(spec$variables$smoking$type, "categorical") + expect_equal(spec$variables$smoking$levels, c("1", "2", "3")) + expect_equal(sum(spec$variables$smoking$proportions), 1) + + expect_equal(spec$variables$interview_date$type, "date") + expect_s3_class(spec$variables$interview_date$range, "Date") + expect_equal(spec$variables$interview_date$source_format, "analysis") +}) + +test_that("mock_spec_from_recodeflow filters exact role and databaseStart tokens", { + variables <- data.frame( + variable = c("age", "disabled_age", "cycle10_age"), + variableType = "Continuous", + rType = "integer", + role = c("enabled", "disabled", "enabled"), + databaseStart = c("cycle1, cycle2", "cycle1", "cycle10"), + distribution = "uniform", + stringsAsFactors = FALSE + ) + details <- data.frame( + variable = c("age", "disabled_age", "cycle10_age"), + recStart = c("[18, 85]", "[18, 85]", "[18, 85]"), + recEnd = "copy", + databaseStart = c("cycle1", "cycle1", "cycle10"), + proportion = 1, + stringsAsFactors = FALSE + ) + + spec <- mock_spec_from_recodeflow( + variables, + details, + databaseStart = "cycle1", + role = "enabled" + ) + + expect_named(spec$variables, "age") +}) + +test_that("mock_spec_from_recodeflow preserves garbage and survival fields", { + variables <- read.csv( + minimal_example_path("variables.csv"), + stringsAsFactors = FALSE, + check.names = FALSE + ) + variable_details <- read.csv( + minimal_example_path("variable_details.csv"), + stringsAsFactors = FALSE, + check.names = FALSE + ) + + spec <- mock_spec_from_recodeflow(variables, variable_details) + + expect_equal(spec$variables$BMI$garbage_rules$low$proportion, 0.02) + expect_equal(spec$variables$BMI$garbage_rules$low$range, "[-10;15])") + expect_equal(spec$variables$BMI$garbage_rules$high$proportion, 0.01) + + expect_equal(spec$variables$primary_event_date$distribution, "gompertz") + expect_equal(spec$variables$primary_event_date$event_prop, 0.3) + expect_equal(spec$variables$primary_event_date$followup_max, 5475) +}) + +test_that("mock_spec_from_recodeflow validates adapter inputs", { + variables <- data.frame( + variable = "age", + variableType = "Continuous", + rType = "integer", + role = "enabled", + stringsAsFactors = FALSE + ) + + expect_error( + mock_spec_from_recodeflow(variables, variable_details = NULL), + "no valid recodeflow detail rows" + ) + + variables$role <- "disabled" + expect_error( + mock_spec_from_recodeflow(variables, data.frame(variable = "age", recStart = "[18, 85]")), + "No variables matched" + ) +}) From b1895f5e0a9a8d107ffdf192cc38c098087933db Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 11:10:54 -0400 Subject: [PATCH 06/18] Harden recodeflow mock_spec adapter --- NEWS.md | 5 +- R/mock_spec_recodeflow.R | 62 +++++++++++- tests/testthat/test-recodeflow-mock-spec.R | 104 ++++++++++++++++++++- 3 files changed, 164 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9b9d9c5..b6558a3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,10 @@ `mock_categorical()`, and `mock_date()` for simple use without recodeflow-style metadata tables. - Added `mock_spec_from_recodeflow()` to adapt recodeflow-style `variables` - and `variable_details` metadata into validated `mock_spec` objects. + and `variable_details` metadata into validated `mock_spec` objects while + preserving role/database filtering, categorical proportions, `recEnd` + missing-code semantics, valid ranges, garbage rules, date ranges, and + survival/date fields. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec_recodeflow.R b/R/mock_spec_recodeflow.R index 0274ce5..f713a15 100644 --- a/R/mock_spec_recodeflow.R +++ b/R/mock_spec_recodeflow.R @@ -14,7 +14,12 @@ if (!file.exists(x)) { stop(label, " file does not exist: ", x, call. = FALSE) } - return(read.csv(x, stringsAsFactors = FALSE, check.names = FALSE)) + return(read.csv( + x, + stringsAsFactors = FALSE, + check.names = FALSE, + na.strings = c("", "NA") + )) } stop(label, " must be a data frame or a single CSV path.", call. = FALSE) @@ -50,7 +55,18 @@ if (.is_blank(value)) { return(default) } - suppressWarnings(as.numeric(value)) + + numeric_value <- suppressWarnings(as.numeric(value)) + if (is.na(numeric_value)) { + stop( + "Column '", name, "' for variable '", + .row_character(row, "variable", ""), + "' must be numeric; got '", as.character(value), "'.", + call. = FALSE + ) + } + + numeric_value } .recodeflow_required_columns <- function(data, required, label) { @@ -61,9 +77,15 @@ } .filter_recodeflow_by_database <- function(data, databaseStart, allow_empty = TRUE) { - if (is.null(databaseStart) || !"databaseStart" %in% names(data)) { + if (is.null(databaseStart)) { return(data) } + if (!"databaseStart" %in% names(data)) { + stop( + "databaseStart filtering was requested, but metadata has no 'databaseStart' column.", + call. = FALSE + ) + } data[.database_start_matches(data$databaseStart, databaseStart, allow_empty = allow_empty), , drop = FALSE] } @@ -143,7 +165,7 @@ !grepl("^garbage_", rec_start, ignore.case = TRUE) & !grepl("^NA::", rec_end) & !grepl("^DerivedVar::", rec_start) & - !grepl("^Func::", rec_end) + !grepl("^Func::", rec_start) details[keep, , drop = FALSE] } @@ -209,7 +231,16 @@ params <- tryCatch( extract_distribution_params(details), - error = function(e) list(distribution = "uniform") + error = function(e) { + warning( + "Could not infer distribution for variable '", + .row_character(var_row, "variable", ""), + "' from details; using uniform. Reason: ", + conditionMessage(e), + call. = FALSE + ) + list(distribution = "uniform") + } ) params$distribution %||% "uniform" } @@ -319,6 +350,20 @@ #' `variable_details` metadata into the normalized v0.4 `mock_spec` shape. It #' returns a validated specification; it does not generate data. #' +#' @details +#' This adapter preserves recodeflow semantics instead of treating metadata as a +#' generic table. It uses exact role and `databaseStart` token matching, parses +#' valid ranges from `recStart`, classifies missing codes from `recEnd` values +#' that begin with `NA::`, preserves categorical levels and proportions, carries +#' `garbage_*` settings into `garbage_rules`, and stores survival/date fields +#' such as `rate`, `shape`, `followup_min`, `followup_max`, and `event_prop` on +#' date variables for later backend milestones. +#' +#' By default, variables identified by `DerivedVar::` or `Func::` rows are +#' excluded because they should be evaluated after raw mock variables are +#' generated. Set `exclude_derived = FALSE` only when you want those rows to +#' appear in the adapter input and fail or be handled by later formula support. +#' #' @param variables Data frame or CSV path for recodeflow-style `variables` #' metadata. #' @param variable_details Data frame, CSV path, or `NULL` for recodeflow-style @@ -386,6 +431,13 @@ mock_spec_from_recodeflow <- function(variables, if (isTRUE(exclude_derived) && !is.null(variable_details)) { derived <- identify_derived_vars(variables, variable_details) + removed <- intersect(variables$variable, derived) + if (length(removed) > 0) { + message( + "Excluding derived recodeflow variable(s): ", + paste(removed, collapse = ", ") + ) + } variables <- variables[!variables$variable %in% derived, , drop = FALSE] } diff --git a/tests/testthat/test-recodeflow-mock-spec.R b/tests/testthat/test-recodeflow-mock-spec.R index 97db9e7..51fdcc4 100644 --- a/tests/testthat/test-recodeflow-mock-spec.R +++ b/tests/testthat/test-recodeflow-mock-spec.R @@ -31,7 +31,7 @@ test_that("mock_spec_from_recodeflow converts minimal metadata", { expect_equal(spec$variables$smoking$type, "categorical") expect_equal(spec$variables$smoking$levels, c("1", "2", "3")) - expect_equal(sum(spec$variables$smoking$proportions), 1) + expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.17) / 0.97) expect_equal(spec$variables$interview_date$type, "date") expect_s3_class(spec$variables$interview_date$range, "Date") @@ -84,6 +84,7 @@ test_that("mock_spec_from_recodeflow preserves garbage and survival fields", { expect_equal(spec$variables$BMI$garbage_rules$low$proportion, 0.02) expect_equal(spec$variables$BMI$garbage_rules$low$range, "[-10;15])") expect_equal(spec$variables$BMI$garbage_rules$high$proportion, 0.01) + expect_equal(spec$variables$BMI$garbage_rules$high$range, "[60;150]") expect_equal(spec$variables$primary_event_date$distribution, "gompertz") expect_equal(spec$variables$primary_event_date$event_prop, 0.3) @@ -110,3 +111,104 @@ test_that("mock_spec_from_recodeflow validates adapter inputs", { "No variables matched" ) }) + +test_that("mock_spec_from_recodeflow fails loudly on missing databaseStart column", { + variables <- data.frame( + variable = "age", + variableType = "Continuous", + rType = "integer", + role = "enabled", + stringsAsFactors = FALSE + ) + details <- data.frame( + variable = "age", + recStart = "[18, 85]", + recEnd = "copy", + proportion = 1, + stringsAsFactors = FALSE + ) + + expect_error( + mock_spec_from_recodeflow(variables, details, databaseStart = "cycle1"), + "no 'databaseStart' column" + ) +}) + +test_that("mock_spec_from_recodeflow rejects non-numeric scalar fields", { + variables <- data.frame( + variable = "age", + variableType = "Continuous", + rType = "integer", + role = "enabled", + distribution = "normal", + mean = "middle", + sd = 10, + stringsAsFactors = FALSE + ) + details <- data.frame( + variable = "age", + recStart = "[18, 85]", + recEnd = "copy", + proportion = 1, + stringsAsFactors = FALSE + ) + + expect_error( + mock_spec_from_recodeflow(variables, details), + "must be numeric" + ) +}) + +test_that("mock_spec_from_recodeflow excludes Func rows from valid ranges", { + variables <- data.frame( + variable = "age", + variableType = "Continuous", + rType = "integer", + role = "enabled", + distribution = "uniform", + stringsAsFactors = FALSE + ) + details <- data.frame( + variable = c("age", "age"), + recStart = c("Func::age_cleanup", "[18, 85]"), + recEnd = c("copy", "copy"), + proportion = c(NA, 1), + stringsAsFactors = FALSE + ) + + spec <- mock_spec_from_recodeflow(variables, details) + + expect_equal(spec$variables$age$range, c(18, 85)) +}) + +test_that("mock_spec_from_recodeflow matches direct adapter specs modulo provenance", { + variables <- data.frame( + variable = "age", + variableType = "Continuous", + rType = "integer", + role = "enabled", + distribution = "uniform", + stringsAsFactors = FALSE + ) + details <- data.frame( + variable = "age", + recStart = "[18, 85]", + recEnd = "copy", + proportion = 1, + stringsAsFactors = FALSE + ) + + recodeflow_spec <- mock_spec_from_recodeflow(variables, details) + direct_spec <- mock_continuous( + "age", + range = c(18, 85), + rtype = "integer", + missing_codes = character(0) + ) + + recodeflow_spec$provenance <- direct_spec$provenance + recodeflow_spec$variables$age$provenance <- direct_spec$variables$age$provenance + recodeflow_spec$variables$age$model_hint <- direct_spec$variables$age$model_hint + + expect_equal(recodeflow_spec, direct_spec) +}) From a6aa4cf4ab3f00aca6a3a6ef4857b1512c8bee00 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 11:24:19 -0400 Subject: [PATCH 07/18] Add native mock_spec backend --- NAMESPACE | 1 + NEWS.md | 2 + R/mock_spec_native.R | 264 +++++++++++++++++++++++++++ tests/testthat/test-native-backend.R | 123 +++++++++++++ 4 files changed, 390 insertions(+) create mode 100644 R/mock_spec_native.R create mode 100644 tests/testthat/test-native-backend.R diff --git a/NAMESPACE b/NAMESPACE index b1b6f71..76b5988 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(create_wide_survival_data) export(extract_distribution_params) export(extract_proportions) export(generate_garbage_values) +export(generate_mock_data_native) export(get_cycle_variables) export(get_enabled_variables) export(get_raw_var_dependencies) diff --git a/NEWS.md b/NEWS.md index b6558a3..5e14789 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ preserving role/database filtering, categorical proportions, `recEnd` missing-code semantics, valid ranges, garbage rules, date ranges, and survival/date fields. +- Added `generate_mock_data_native()` to generate baseline valid mock data from + `mock_spec` objects with the native R backend. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec_native.R b/R/mock_spec_native.R new file mode 100644 index 0000000..1e0d812 --- /dev/null +++ b/R/mock_spec_native.R @@ -0,0 +1,264 @@ +# ============================================================================== +# MockData v0.4 Native Backend +# ============================================================================== +# Baseline native generation from mock_spec. Post-processing for missing codes, +# garbage, diagnostics, and richer rType handling lands in later milestones. +# ============================================================================== + +.with_mock_seed <- function(seed, expr) { + if (is.null(seed)) { + return(force(expr)) + } + + if (!is.numeric(seed) || length(seed) != 1 || is.na(seed) || seed != floor(seed)) { + stop("seed must be a single whole number.", call. = FALSE) + } + + had_seed <- exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE) + if (had_seed) { + old_seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE) + } + + on.exit({ + if (had_seed) { + assign(".Random.seed", old_seed, envir = .GlobalEnv) + } else if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) { + rm(".Random.seed", envir = .GlobalEnv) + } + }, add = TRUE) + + set.seed(seed) + force(expr) +} + +.empty_native_data <- function(n) { + data.frame(row.names = seq_len(n)) +} + +.sample_indices <- function(n_levels, n, prob = NULL) { + if (n == 0) { + return(integer(0)) + } + sample.int(n_levels, size = n, replace = TRUE, prob = prob) +} + +.native_truncated_normal <- function(n, mean, sd, range) { + if (n == 0) { + return(numeric(0)) + } + + values <- rep(NA_real_, n) + remaining <- seq_len(n) + attempts <- 0 + max_attempts <- 100 + + while (length(remaining) > 0 && attempts < max_attempts) { + draws <- stats::rnorm(length(remaining), mean = mean, sd = sd) + valid <- draws >= range[[1]] & draws <= range[[2]] + values[remaining[valid]] <- draws[valid] + remaining <- remaining[!valid] + attempts <- attempts + 1 + } + + if (length(remaining) > 0) { + warning( + "Could not fill all truncated-normal values by rejection sampling; ", + "using uniform draws for the remaining values.", + call. = FALSE + ) + values[remaining] <- stats::runif(length(remaining), range[[1]], range[[2]]) + } + + values +} + +.coerce_native_continuous <- function(values, rtype, variable_name) { + if (rtype == "integer") { + return(as.integer(round(values))) + } + if (rtype %in% c("double", "numeric")) { + return(as.numeric(values)) + } + + stop( + "Variable '", variable_name, "' has unsupported native continuous rType '", + rtype, "'.", + call. = FALSE + ) +} + +.coerce_native_categorical <- function(values, levels, rtype, variable_name) { + if (rtype == "factor") { + return(factor(values, levels = levels)) + } + if (rtype == "character") { + return(as.character(values)) + } + if (rtype == "integer") { + converted <- suppressWarnings(as.integer(values)) + if (any(is.na(converted) & !is.na(values))) { + stop( + "Variable '", variable_name, + "' integer categorical generation requires integer-like levels.", + call. = FALSE + ) + } + return(converted) + } + if (rtype %in% c("double", "numeric")) { + converted <- suppressWarnings(as.numeric(values)) + if (any(is.na(converted) & !is.na(values))) { + stop( + "Variable '", variable_name, + "' numeric categorical generation requires numeric-like levels.", + call. = FALSE + ) + } + return(converted) + } + if (rtype == "logical") { + if (!all(values %in% c("TRUE", "FALSE", "true", "false", "1", "0", TRUE, FALSE))) { + stop( + "Variable '", variable_name, + "' logical categorical generation requires TRUE/FALSE or 1/0 levels.", + call. = FALSE + ) + } + return(values %in% c("TRUE", "true", "1", TRUE)) + } + + stop( + "Variable '", variable_name, "' has unsupported native categorical rType '", + rtype, "'.", + call. = FALSE + ) +} + +.coerce_native_date <- function(values, rtype, variable_name) { + if (rtype == "date") { + return(values) + } + if (rtype == "character") { + return(as.character(values)) + } + + stop( + "Variable '", variable_name, "' has unsupported native date rType '", + rtype, "'.", + call. = FALSE + ) +} + +.generate_native_continuous <- function(variable, n) { + distribution <- tolower(variable$distribution %||% "uniform") + + if (distribution == "uniform") { + values <- stats::runif(n, variable$range[[1]], variable$range[[2]]) + } else if (distribution == "normal") { + values <- .native_truncated_normal(n, variable$mean, variable$sd, variable$range) + } else { + stop( + "Native backend does not yet support continuous distribution '", + distribution, "' for variable '", variable$name, "'.", + call. = FALSE + ) + } + + .coerce_native_continuous(values, variable$rtype, variable$name) +} + +.generate_native_categorical <- function(variable, n) { + levels <- as.character(variable$levels) + prob <- variable$proportions + if (is.null(prob)) { + prob <- rep(1 / length(levels), length(levels)) + } + + values <- levels[.sample_indices(length(levels), n, prob)] + .coerce_native_categorical(values, levels, variable$rtype, variable$name) +} + +.generate_native_date <- function(variable, n) { + distribution <- tolower(variable$distribution %||% "uniform") + if (distribution != "uniform") { + stop( + "Native backend does not yet support date distribution '", + distribution, "' for variable '", variable$name, "'.", + call. = FALSE + ) + } + + if (n == 0) { + values <- as.Date(character(0)) + } else { + range_numeric <- as.integer(variable$range) + offsets <- .sample_indices( + range_numeric[[2]] - range_numeric[[1]] + 1, + n + ) - 1 + values <- as.Date(range_numeric[[1]] + offsets, origin = "1970-01-01") + } + + .coerce_native_date(values, variable$rtype, variable$name) +} + +.generate_native_variable <- function(variable, n) { + if (variable$type == "continuous") { + return(.generate_native_continuous(variable, n)) + } + if (variable$type == "categorical") { + return(.generate_native_categorical(variable, n)) + } + if (variable$type == "date") { + return(.generate_native_date(variable, n)) + } + + stop( + "Native backend does not support variable type '", variable$type, + "' for variable '", variable$name, "'.", + call. = FALSE + ) +} + +#' Generate mock data with the native backend +#' +#' `generate_mock_data_native()` consumes a validated `mock_spec` and generates +#' baseline valid values using MockData's native R backend. This milestone does +#' not yet apply missing-code injection, garbage values, diagnostics, formula +#' evaluation, or optional `simstudy` features. +#' +#' @param spec A `mock_spec` object. +#' @param n Non-negative whole number of rows to generate. +#' @param seed Optional whole-number random seed. The previous R random state is +#' restored after generation. +#' +#' @return A data frame with one column per `mock_spec` variable and `n` rows. +#' @family mock generation APIs +#' @seealso [mock_spec()], [mock_continuous()], [mock_spec_from_recodeflow()] +#' +#' @examples +#' spec <- mock_spec( +#' mock_spec_continuous("age", range = c(18, 85), rtype = "integer"), +#' mock_spec_categorical( +#' "smoking", +#' levels = c("never", "former", "current"), +#' proportions = c(0.5, 0.3, 0.2) +#' ) +#' ) +#' data <- generate_mock_data_native(spec, n = 10, seed = 1) +#' head(data) +#' +#' @export +generate_mock_data_native <- function(spec, n, seed = NULL) { + validate_mock_spec(spec, n = n, strict = TRUE) + + .with_mock_seed(seed, { + if (length(spec$variables) == 0) { + .empty_native_data(n) + } else { + columns <- lapply(spec$variables, .generate_native_variable, n = n) + names(columns) <- names(spec$variables) + as.data.frame(columns, stringsAsFactors = FALSE, check.names = FALSE) + } + }) +} diff --git a/tests/testthat/test-native-backend.R b/tests/testthat/test-native-backend.R new file mode 100644 index 0000000..7b3c56d --- /dev/null +++ b/tests/testthat/test-native-backend.R @@ -0,0 +1,123 @@ +test_that("generate_mock_data_native generates baseline direct specs", { + spec <- mock_spec( + mock_spec_continuous( + "age", + range = c(18, 85), + distribution = "normal", + mean = 50, + sd = 12, + rtype = "integer" + ), + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ), + mock_spec_date( + "interview_date", + range = as.Date(c("2001-01-01", "2005-12-31")) + ) + ) + + result <- generate_mock_data_native(spec, n = 500, seed = 101) + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 500) + expect_named(result, c("age", "smoking", "interview_date")) + expect_true(all(result$age >= 18 & result$age <= 85)) + expect_type(result$age, "integer") + expect_true(all(result$smoking %in% c("never", "former", "current"))) + expect_s3_class(result$interview_date, "Date") + expect_true(all(result$interview_date >= as.Date("2001-01-01"))) + expect_true(all(result$interview_date <= as.Date("2005-12-31"))) +}) + +test_that("generate_mock_data_native is reproducible without leaking RNG state", { + spec <- mock_continuous("age", range = c(18, 85), rtype = "integer") + + set.seed(999) + before <- runif(1) + result_1 <- generate_mock_data_native(spec, n = 10, seed = 42) + after_1 <- runif(1) + + set.seed(999) + expect_equal(runif(1), before) + result_2 <- generate_mock_data_native(spec, n = 10, seed = 42) + after_2 <- runif(1) + + expect_equal(result_1, result_2) + expect_equal(after_1, after_2) +}) + +test_that("generate_mock_data_native handles empty specs and n = 0", { + empty <- generate_mock_data_native(mock_spec(), n = 5, seed = 1) + expect_s3_class(empty, "data.frame") + expect_equal(nrow(empty), 5) + expect_equal(ncol(empty), 0) + + spec <- mock_categorical("smoking", levels = c("never", "former", "current")) + zero <- generate_mock_data_native(spec, n = 0, seed = 1) + expect_equal(nrow(zero), 0) + expect_named(zero, "smoking") +}) + +test_that("generate_mock_data_native consumes simple recodeflow specs", { + variables <- data.frame( + variable = c("age", "smoking", "interview_date"), + variableType = c("Continuous", "Categorical", "Continuous"), + rType = c("integer", "character", "date"), + role = "enabled", + distribution = c("uniform", "", "uniform"), + stringsAsFactors = FALSE + ) + details <- data.frame( + variable = c("age", "smoking", "smoking", "interview_date"), + recStart = c("[18, 85]", "never", "current", "[2001-01-01,2001-01-31]"), + recEnd = c("copy", "never", "current", "copy"), + proportion = c(1, 0.75, 0.25, 1), + stringsAsFactors = FALSE + ) + + spec <- mock_spec_from_recodeflow(variables, details) + result <- generate_mock_data_native(spec, n = 100, seed = 55) + + expect_named(result, c("age", "smoking", "interview_date")) + expect_true(all(result$age >= 18 & result$age <= 85)) + expect_true(all(result$smoking %in% c("never", "current"))) + expect_s3_class(result$interview_date, "Date") +}) + +test_that("generate_mock_data_native fails loudly for unsupported native features", { + survival_like <- mock_spec( + mock_spec_date( + "event_date", + range = as.Date(c("2001-01-01", "2005-12-31")) + ), + validate = FALSE + ) + survival_like$variables$event_date$distribution <- "gompertz" + + expect_error( + generate_mock_data_native(survival_like, n = 10), + "does not yet support date distribution" + ) + + expect_error( + generate_mock_data_native(list(), n = 10), + "mock_spec" + ) +}) + +test_that("generate_mock_data_native rejects lossy categorical coercion", { + spec <- mock_categorical( + "smoking", + levels = c("never", "former", "current"), + rtype = "integer" + ) + + expect_error( + generate_mock_data_native(spec, n = 10, seed = 1), + "integer-like levels" + ) +}) From 501698b6ae70a865af84b29afe8b1206d28dd819 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 11:33:49 -0400 Subject: [PATCH 08/18] Harden native mock_spec backend --- R/mock_spec_native.R | 37 ++++++++++- tests/testthat/test-native-backend.R | 93 ++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 3 deletions(-) diff --git a/R/mock_spec_native.R b/R/mock_spec_native.R index 1e0d812..990cbf2 100644 --- a/R/mock_spec_native.R +++ b/R/mock_spec_native.R @@ -42,7 +42,30 @@ sample.int(n_levels, size = n, replace = TRUE, prob = prob) } -.native_truncated_normal <- function(n, mean, sd, range) { +.native_formula_variables <- function(spec) { + names(Filter(function(variable) { + formula <- variable$formula + !is.null(formula) && + !(is.character(formula) && length(formula) == 1 && (is.na(formula) || trimws(formula) == "")) + }, spec$variables)) +} + +.check_native_backend_scope <- function(spec) { + formula_variables <- .native_formula_variables(spec) + if (length(formula_variables) > 0) { + stop( + "Formula evaluation is not yet implemented in the M4 native backend. ", + "Formula variable(s): ", + paste(formula_variables, collapse = ", "), + ". Expected in a later formula/dependency milestone.", + call. = FALSE + ) + } + + invisible(TRUE) +} + +.native_truncated_normal <- function(n, mean, sd, range, variable_name) { if (n == 0) { return(numeric(0)) } @@ -62,7 +85,8 @@ if (length(remaining) > 0) { warning( - "Could not fill all truncated-normal values by rejection sampling; ", + "Variable '", variable_name, + "': could not fill all truncated-normal values by rejection sampling; ", "using uniform draws for the remaining values.", call. = FALSE ) @@ -155,7 +179,13 @@ if (distribution == "uniform") { values <- stats::runif(n, variable$range[[1]], variable$range[[2]]) } else if (distribution == "normal") { - values <- .native_truncated_normal(n, variable$mean, variable$sd, variable$range) + values <- .native_truncated_normal( + n, + variable$mean, + variable$sd, + variable$range, + variable$name + ) } else { stop( "Native backend does not yet support continuous distribution '", @@ -251,6 +281,7 @@ #' @export generate_mock_data_native <- function(spec, n, seed = NULL) { validate_mock_spec(spec, n = n, strict = TRUE) + .check_native_backend_scope(spec) .with_mock_seed(seed, { if (length(spec$variables) == 0) { diff --git a/tests/testthat/test-native-backend.R b/tests/testthat/test-native-backend.R index 7b3c56d..9904878 100644 --- a/tests/testthat/test-native-backend.R +++ b/tests/testthat/test-native-backend.R @@ -33,6 +33,39 @@ test_that("generate_mock_data_native generates baseline direct specs", { expect_true(all(result$interview_date <= as.Date("2005-12-31"))) }) +test_that("generate_mock_data_native preserves statistical contracts", { + spec <- mock_spec( + mock_spec_continuous("uniform_age", range = c(20, 80), rtype = "double"), + mock_spec_continuous( + "normal_age", + range = c(18, 85), + distribution = "normal", + mean = 50, + sd = 12, + rtype = "double" + ), + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + ) + + result <- generate_mock_data_native(spec, n = 5000, seed = 202) + + expect_equal(mean(result$uniform_age), 50, tolerance = 1) + expect_equal(stats::sd(result$uniform_age), 60 / sqrt(12), tolerance = 1) + expect_equal(mean(result$normal_age), 50, tolerance = 1) + expect_equal(stats::sd(result$normal_age), 12, tolerance = 1) + + observed <- prop.table(table(factor( + result$smoking, + levels = c("never", "former", "current") + ))) + expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.03) +}) + test_that("generate_mock_data_native is reproducible without leaking RNG state", { spec <- mock_continuous("age", range = c(18, 85), rtype = "integer") @@ -60,6 +93,22 @@ test_that("generate_mock_data_native handles empty specs and n = 0", { zero <- generate_mock_data_native(spec, n = 0, seed = 1) expect_equal(nrow(zero), 0) expect_named(zero, "smoking") + + continuous_zero <- generate_mock_data_native( + mock_continuous("age", range = c(18, 85)), + n = 0, + seed = 1 + ) + expect_equal(nrow(continuous_zero), 0) + expect_named(continuous_zero, "age") + + one <- generate_mock_data_native( + mock_continuous("age", range = c(18, 85), rtype = "integer"), + n = 1, + seed = 1 + ) + expect_equal(nrow(one), 1) + expect_true(one$age >= 18 && one$age <= 85) }) test_that("generate_mock_data_native consumes simple recodeflow specs", { @@ -86,6 +135,23 @@ test_that("generate_mock_data_native consumes simple recodeflow specs", { expect_true(all(result$age >= 18 & result$age <= 85)) expect_true(all(result$smoking %in% c("never", "current"))) expect_s3_class(result$interview_date, "Date") + + direct_spec <- mock_spec( + mock_spec_continuous("age", range = c(18, 85), rtype = "integer"), + mock_spec_categorical( + "smoking", + levels = c("never", "current"), + proportions = c(0.75, 0.25), + rtype = "character" + ), + mock_spec_date( + "interview_date", + range = as.Date(c("2001-01-01", "2001-01-31")) + ) + ) + + direct_result <- generate_mock_data_native(direct_spec, n = 100, seed = 55) + expect_equal(result, direct_result) }) test_that("generate_mock_data_native fails loudly for unsupported native features", { @@ -109,6 +175,33 @@ test_that("generate_mock_data_native fails loudly for unsupported native feature ) }) +test_that("generate_mock_data_native rejects formula specs until evaluator milestone", { + variable <- mock_spec_continuous("bmi", range = c(15, 50)) + variable$formula <- "weight / height^2" + spec <- mock_spec(variable) + + expect_error( + generate_mock_data_native(spec, n = 10, seed = 1), + "Formula evaluation is not yet implemented" + ) +}) + +test_that("generate_mock_data_native warns on truncated normal fallback", { + spec <- mock_spec(mock_spec_continuous( + "age", + range = c(0, 1), + distribution = "normal", + mean = 1000, + sd = 1 + )) + + expect_warning( + result <- generate_mock_data_native(spec, n = 5, seed = 1), + "Variable 'age'" + ) + expect_true(all(result$age >= 0 & result$age <= 1)) +}) + test_that("generate_mock_data_native rejects lossy categorical coercion", { spec <- mock_categorical( "smoking", From fb6ae3f1aa865650623f55945c35ab2b95b96a91 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 19:26:23 -0400 Subject: [PATCH 09/18] Add mock_spec post-processing layer --- NAMESPACE | 1 + NEWS.md | 3 + R/mock_spec_postprocess.R | 312 ++++++++++++++++++++ tests/testthat/test-mock-spec-postprocess.R | 162 ++++++++++ 4 files changed, 478 insertions(+) create mode 100644 R/mock_spec_postprocess.R create mode 100644 tests/testthat/test-mock-spec-postprocess.R diff --git a/NAMESPACE b/NAMESPACE index 76b5988..52cae5a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ export(mock_spec_date) export(mock_spec_from_recodeflow) export(parse_range_notation) export(parse_variable_start) +export(postprocess_mock_data) export(read_mock_data_config) export(read_mock_data_config_details) export(sample_with_proportions) diff --git a/NEWS.md b/NEWS.md index 5e14789..fe6fda1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,9 @@ survival/date fields. - Added `generate_mock_data_native()` to generate baseline valid mock data from `mock_spec` objects with the native R backend. +- Added `postprocess_mock_data()` to apply `mock_spec` missing-code and + garbage-value rules after baseline generation, with diagnostics that + distinguish assigned missing/garbage rows from naturally drawn values. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R new file mode 100644 index 0000000..f71a6d5 --- /dev/null +++ b/R/mock_spec_postprocess.R @@ -0,0 +1,312 @@ +# ============================================================================== +# MockData v0.4 Post-processing Layer +# ============================================================================== +# Applies missing-code and garbage-value rules after baseline generation while +# preserving diagnostics that distinguish assigned states from coincidental +# value collisions. +# ============================================================================== + +.postprocess_empty_diagnostics <- function(spec, n) { + variables <- lapply(spec$variables, function(variable) { + list( + n = n, + preexisting_missing_code_indices = integer(0), + assigned_missing_indices = integer(0), + assigned_missing_codes = character(0), + assigned_garbage_indices = list(low = integer(0), high = integer(0)), + assigned_garbage_values = list(low = character(0), high = character(0)) + ) + }) + + list( + spec_version = spec$spec_version, + variables = variables + ) +} + +.values_match_codes <- function(values, codes) { + if (length(codes) == 0) { + return(rep(FALSE, length(values))) + } + + as.character(values) %in% as.character(codes) +} + +.sample_postprocess_indices <- function(candidates, n, avoid = integer(0)) { + if (n == 0) { + return(integer(0)) + } + if (length(candidates) < n) { + stop("Not enough candidate rows are available for post-processing.", call. = FALSE) + } + + preferred <- setdiff(candidates, avoid) + if (length(preferred) >= n) { + return(.sample_values(preferred, n)) + } + + c( + preferred, + .sample_values(setdiff(candidates, preferred), n - length(preferred)) + ) +} + +.coerce_postprocess_values <- function(values, variable, target) { + if (inherits(target, "factor")) { + return(as.character(values)) + } + + if (inherits(target, "Date") || variable$rtype == "date") { + converted <- as.Date(values) + if (any(is.na(converted) & !is.na(values))) { + stop("Variable '", variable$name, "' has date post-processing values that cannot be parsed.", call. = FALSE) + } + return(converted) + } + + if (is.integer(target) || variable$rtype == "integer") { + converted <- suppressWarnings(as.integer(round(as.numeric(values)))) + if (any(is.na(converted) & !is.na(values))) { + stop("Variable '", variable$name, "' has integer post-processing values that cannot be parsed.", call. = FALSE) + } + return(converted) + } + + if (is.numeric(target) || variable$rtype %in% c("double", "numeric")) { + converted <- suppressWarnings(as.numeric(values)) + if (any(is.na(converted) & !is.na(values))) { + stop("Variable '", variable$name, "' has numeric post-processing values that cannot be parsed.", call. = FALSE) + } + return(converted) + } + + if (is.logical(target) || variable$rtype == "logical") { + value_chr <- as.character(values) + if (!all(value_chr %in% c("TRUE", "FALSE", "true", "false", "1", "0"))) { + stop("Variable '", variable$name, "' has logical post-processing values that cannot be parsed.", call. = FALSE) + } + return(value_chr %in% c("TRUE", "true", "1")) + } + + as.character(values) +} + +.assign_postprocess_values <- function(target, indices, values) { + if (length(indices) == 0) { + return(target) + } + + if (inherits(target, "factor")) { + missing_levels <- setdiff(as.character(values), levels(target)) + if (length(missing_levels) > 0) { + levels(target) <- c(levels(target), missing_levels) + } + } + + target[indices] <- values + target +} + +.generate_garbage_for_rule <- function(rule, variable, n) { + if (n == 0) { + return(vector(mode = "character", length = 0)) + } + + parsed <- parse_range_notation(rule$range) + if (is.null(parsed)) { + stop( + "Variable '", variable$name, "' has an invalid garbage range: ", + rule$range, + call. = FALSE + ) + } + + if (identical(parsed$type, "date")) { + date_values <- seq(parsed$min, parsed$max, by = "day") + return(.sample_values(date_values, n, replace = TRUE)) + } + + if (identical(parsed$type, "integer") && !is.null(parsed$values)) { + return(.sample_values(parsed$values, n, replace = TRUE)) + } + + values <- stats::runif(n, parsed$min, parsed$max) + if (variable$type == "categorical" || variable$rtype == "integer") { + values <- round(values) + } + + values +} + +.postprocess_missing <- function(values, variable, diagnostics) { + if (length(variable$missing_codes) == 0) { + return(list(values = values, diagnostics = diagnostics)) + } + + available <- seq_along(values) + preexisting <- which(.values_match_codes(values, variable$missing_codes)) + assigned <- integer(0) + assigned_codes <- character(0) + + for (i in seq_along(variable$missing_codes)) { + proportion <- variable$missing_proportions[[i]] + n_assign <- round(length(values) * proportion) + if (n_assign == 0) { + next + } + + code <- variable$missing_codes[[i]] + code_values <- rep(code, n_assign) + assign_idx <- .sample_postprocess_indices( + available, + n_assign, + avoid = union(preexisting, assigned) + ) + coerced <- .coerce_postprocess_values(code_values, variable, values) + values <- .assign_postprocess_values(values, assign_idx, coerced) + + available <- setdiff(available, assign_idx) + assigned <- c(assigned, assign_idx) + assigned_codes <- c(assigned_codes, as.character(code_values)) + } + + diagnostics$preexisting_missing_code_indices <- preexisting + diagnostics$assigned_missing_indices <- assigned + diagnostics$assigned_missing_codes <- assigned_codes + + list(values = values, diagnostics = diagnostics) +} + +.postprocess_garbage <- function(values, variable, diagnostics) { + if (length(variable$garbage_rules) == 0) { + return(list(values = values, diagnostics = diagnostics)) + } + if (is.null(names(variable$garbage_rules)) || any(names(variable$garbage_rules) == "")) { + stop("Variable '", variable$name, "' garbage_rules must be a named list.", call. = FALSE) + } + + assigned_missing <- diagnostics$assigned_missing_indices + valid_idx <- setdiff(which(!is.na(values)), assigned_missing) + remaining_idx <- valid_idx + + requested <- vapply(variable$garbage_rules, function(rule) { + proportion <- rule$proportion %||% 0 + if (is.na(proportion)) { + proportion <- 0 + } + as.integer(round(length(valid_idx) * proportion)) + }, integer(1)) + + if (sum(requested) > length(valid_idx)) { + stop( + "Variable '", variable$name, + "' garbage rules request more rows than are available after missing-code assignment.", + call. = FALSE + ) + } + + for (rule_name in names(variable$garbage_rules)) { + rule <- variable$garbage_rules[[rule_name]] + n_assign <- requested[[rule_name]] + if (n_assign == 0) { + next + } + + if (is.null(rule$range) || is.na(rule$range) || trimws(rule$range) == "") { + stop("Variable '", variable$name, "' garbage rule '", rule_name, "' is missing a range.", call. = FALSE) + } + + assign_idx <- .sample_postprocess_indices(remaining_idx, n_assign) + raw_values <- .generate_garbage_for_rule(rule, variable, n_assign) + coerced <- .coerce_postprocess_values(raw_values, variable, values) + values <- .assign_postprocess_values(values, assign_idx, coerced) + + diagnostics$assigned_garbage_indices[[rule_name]] <- assign_idx + diagnostics$assigned_garbage_values[[rule_name]] <- coerced + remaining_idx <- setdiff(remaining_idx, assign_idx) + } + + list(values = values, diagnostics = diagnostics) +} + +.postprocess_variable <- function(values, variable, diagnostics) { + missing_result <- .postprocess_missing(values, variable, diagnostics) + garbage_result <- .postprocess_garbage( + missing_result$values, + variable, + missing_result$diagnostics + ) + + garbage_result +} + +#' Apply mock_spec post-processing rules +#' +#' `postprocess_mock_data()` applies v0.4 `mock_spec` missing-code and +#' garbage-value rules to an already generated baseline data frame. It records a +#' `mockdata_diagnostics` attribute so downstream checks can distinguish values +#' assigned by post-processing from values that were drawn naturally by the +#' baseline generator. +#' +#' @param data Data frame with one column for each variable in `spec`. +#' @param spec A validated `mock_spec` object. +#' @param seed Optional whole-number random seed. The previous R random state is +#' restored after post-processing. +#' @param diagnostics Logical. If `TRUE`, attach a `mockdata_diagnostics` +#' attribute to the returned data frame. +#' +#' @return A data frame with post-processing applied. +#' @family mock generation APIs +#' @seealso [generate_mock_data_native()], [mock_spec()] +#' +#' @examples +#' spec <- mock_categorical( +#' "smoking", +#' levels = c("never", "former", "current"), +#' proportions = c(0.5, 0.3, 0.2), +#' rtype = "character", +#' missing_codes = "9", +#' missing_proportions = 0.05 +#' ) +#' baseline <- generate_mock_data_native(spec, n = 20, seed = 1) +#' result <- postprocess_mock_data(baseline, spec, seed = 2) +#' attr(result, "mockdata_diagnostics")$variables$smoking +#' +#' @export +postprocess_mock_data <- function(data, spec, seed = NULL, diagnostics = TRUE) { + if (!is.data.frame(data)) { + stop("data must be a data frame.", call. = FALSE) + } + validate_mock_spec(spec, n = nrow(data), strict = TRUE) + + missing_columns <- setdiff(names(spec$variables), names(data)) + if (length(missing_columns) > 0) { + stop( + "data is missing column(s) required by spec: ", + paste(missing_columns, collapse = ", "), + call. = FALSE + ) + } + + .with_mock_seed(seed, { + output <- data + diag <- .postprocess_empty_diagnostics(spec, nrow(data)) + + for (variable_name in names(spec$variables)) { + variable <- spec$variables[[variable_name]] + result <- .postprocess_variable( + output[[variable_name]], + variable, + diag$variables[[variable_name]] + ) + output[[variable_name]] <- result$values + diag$variables[[variable_name]] <- result$diagnostics + } + + if (isTRUE(diagnostics)) { + attr(output, "mockdata_diagnostics") <- diag + } + + output + }) +} diff --git a/tests/testthat/test-mock-spec-postprocess.R b/tests/testthat/test-mock-spec-postprocess.R new file mode 100644 index 0000000..b4dc87f --- /dev/null +++ b/tests/testthat/test-mock-spec-postprocess.R @@ -0,0 +1,162 @@ +test_that("postprocess_mock_data distinguishes missing-code collisions", { + spec <- mock_categorical( + "response", + levels = c("1", "97"), + proportions = c(0.7, 0.3), + rtype = "character", + missing_codes = "97", + missing_proportions = 0.2 + ) + baseline <- generate_mock_data_native(spec, n = 200, seed = 11) + + result <- postprocess_mock_data(baseline, spec, seed = 12) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$response + + expect_true(length(diagnostics$preexisting_missing_code_indices) > 0) + expect_equal(length(diagnostics$assigned_missing_indices), 40) + expect_length(intersect( + diagnostics$preexisting_missing_code_indices, + diagnostics$assigned_missing_indices + ), 0) + expect_true(all(result$response[diagnostics$assigned_missing_indices] == "97")) + expect_true(any(baseline$response[diagnostics$preexisting_missing_code_indices] == "97")) +}) + +test_that("postprocess_mock_data applies integer missing and garbage rules", { + spec <- mock_continuous( + "age", + range = c(18, 85), + rtype = "integer", + missing_codes = 997, + missing_proportions = 0.1, + garbage_rules = list(high = list(proportion = 0.05, range = "[150, 200]")) + ) + baseline <- generate_mock_data_native(spec, n = 100, seed = 21) + + result <- postprocess_mock_data(baseline, spec, seed = 22) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$age + high_idx <- diagnostics$assigned_garbage_indices$high + + expect_type(result$age, "integer") + expect_equal(length(diagnostics$assigned_missing_indices), 10) + expect_equal(result$age[diagnostics$assigned_missing_indices], rep(997L, 10)) + expect_equal(length(high_idx), round(90 * 0.05)) + expect_true(all(result$age[high_idx] >= 150L & result$age[high_idx] <= 200L)) + expect_length(intersect(high_idx, diagnostics$assigned_missing_indices), 0) +}) + +test_that("postprocess_mock_data preserves Date values", { + spec <- mock_date( + "interview_date", + range = as.Date(c("2001-01-01", "2001-01-31")), + missing_codes = "2099-01-01", + missing_proportions = 0.1, + garbage_rules = list(high = list( + proportion = 0.1, + range = "[2025-01-01, 2025-01-31]" + )) + ) + baseline <- generate_mock_data_native(spec, n = 50, seed = 31) + + result <- postprocess_mock_data(baseline, spec, seed = 32) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$interview_date + high_idx <- diagnostics$assigned_garbage_indices$high + + expect_s3_class(result$interview_date, "Date") + expect_equal( + result$interview_date[diagnostics$assigned_missing_indices], + rep(as.Date("2099-01-01"), 5) + ) + expect_true(all(result$interview_date[high_idx] >= as.Date("2025-01-01"))) + expect_true(all(result$interview_date[high_idx] <= as.Date("2025-01-31"))) +}) + +test_that("postprocess_mock_data is reproducible without leaking RNG state", { + spec <- mock_continuous( + "age", + range = c(18, 85), + rtype = "integer", + missing_codes = 997, + missing_proportions = 0.1, + garbage_rules = list(high = list(proportion = 0.1, range = "[150, 200]")) + ) + baseline <- generate_mock_data_native(spec, n = 100, seed = 71) + + set.seed(999) + before <- runif(1) + result_1 <- postprocess_mock_data(baseline, spec, seed = 72) + after_1 <- runif(1) + + set.seed(999) + expect_equal(runif(1), before) + result_2 <- postprocess_mock_data(baseline, spec, seed = 72) + after_2 <- runif(1) + + expect_equal(result_1, result_2) + expect_equal(after_1, after_2) +}) + +test_that("postprocess_mock_data preserves and extends factor levels", { + spec <- mock_categorical( + "smoking", + levels = c("1", "2", "3"), + proportions = c(0.5, 0.3, 0.2), + missing_codes = "9", + missing_proportions = 0.1, + garbage_rules = list(low = list(proportion = 0.1, range = "[-2, 0]")) + ) + baseline <- generate_mock_data_native(spec, n = 60, seed = 41) + + result <- postprocess_mock_data(baseline, spec, seed = 42) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$smoking + low_idx <- diagnostics$assigned_garbage_indices$low + + expect_s3_class(result$smoking, "factor") + expect_true("9" %in% levels(result$smoking)) + expect_true(all(as.character(result$smoking[diagnostics$assigned_missing_indices]) == "9")) + expect_true(all(as.character(result$smoking[low_idx]) %in% c("-2", "-1", "0"))) +}) + +test_that("postprocess_mock_data rejects unnamed garbage rules", { + spec <- mock_continuous( + "age", + range = c(18, 85), + garbage_rules = list(list(proportion = 0.1, range = "[150, 200]")) + ) + baseline <- generate_mock_data_native(spec, n = 10, seed = 81) + + expect_error( + postprocess_mock_data(baseline, spec, seed = 82), + "named list" + ) +}) + +test_that("postprocess_mock_data rejects impossible garbage requests", { + spec <- mock_continuous( + "age", + range = c(18, 85), + garbage_rules = list( + low = list(proportion = 0.8, range = "[-10, 0]"), + high = list(proportion = 0.8, range = "[150, 200]") + ) + ) + baseline <- generate_mock_data_native(spec, n = 10, seed = 51) + + expect_error( + postprocess_mock_data(baseline, spec, seed = 52), + "request more rows" + ) +}) + +test_that("postprocess_mock_data validates input shape and diagnostics opt-out", { + spec <- mock_continuous("age", range = c(18, 85)) + baseline <- generate_mock_data_native(spec, n = 10, seed = 61) + + expect_error( + postprocess_mock_data(data.frame(other = 1:10), spec), + "missing column" + ) + + result <- postprocess_mock_data(baseline, spec, diagnostics = FALSE) + expect_null(attr(result, "mockdata_diagnostics")) +}) From 6daef47cc411115d2b2d898591a48a2e9dde2cd7 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 19:50:53 -0400 Subject: [PATCH 10/18] Harden mock_spec post-processing diagnostics --- R/mock_spec_postprocess.R | 64 +++++++++++++++-- tests/testthat/test-mock-spec-postprocess.R | 76 ++++++++++++++++++++- 2 files changed, 133 insertions(+), 7 deletions(-) diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R index f71a6d5..fc1b671 100644 --- a/R/mock_spec_postprocess.R +++ b/R/mock_spec_postprocess.R @@ -8,13 +8,27 @@ .postprocess_empty_diagnostics <- function(spec, n) { variables <- lapply(spec$variables, function(variable) { + garbage_rule_names <- names(variable$garbage_rules) + if (is.null(garbage_rule_names)) { + garbage_rule_names <- character(0) + } + garbage_rule_names <- .ordered_garbage_rule_names(garbage_rule_names) + garbage_indices <- stats::setNames( + rep(list(integer(0)), length(garbage_rule_names)), + garbage_rule_names + ) + garbage_values <- stats::setNames( + rep(list(character(0)), length(garbage_rule_names)), + garbage_rule_names + ) + list( n = n, preexisting_missing_code_indices = integer(0), assigned_missing_indices = integer(0), assigned_missing_codes = character(0), - assigned_garbage_indices = list(low = integer(0), high = integer(0)), - assigned_garbage_values = list(low = character(0), high = character(0)) + assigned_garbage_indices = garbage_indices, + assigned_garbage_values = garbage_values ) }) @@ -138,10 +152,21 @@ values } +.ordered_garbage_rule_names <- function(rule_names) { + c(intersect(c("low", "high"), rule_names), setdiff(rule_names, c("low", "high"))) +} + .postprocess_missing <- function(values, variable, diagnostics) { if (length(variable$missing_codes) == 0) { return(list(values = values, diagnostics = diagnostics)) } + if (sum(variable$missing_proportions) > 1 + .mock_spec_probability_tolerance) { + stop( + "Variable '", variable$name, + "' missing proportions request more rows than are available.", + call. = FALSE + ) + } available <- seq_along(values) preexisting <- which(.values_match_codes(values, variable$missing_codes)) @@ -182,11 +207,25 @@ return(list(values = values, diagnostics = diagnostics)) } if (is.null(names(variable$garbage_rules)) || any(names(variable$garbage_rules) == "")) { - stop("Variable '", variable$name, "' garbage_rules must be a named list.", call. = FALSE) + rule_names <- names(variable$garbage_rules) + unnamed <- if (is.null(rule_names)) { + seq_along(variable$garbage_rules) + } else { + which(rule_names == "") + } + stop( + "Variable '", variable$name, "' garbage_rules must be a named list; ", + "unnamed rule index: ", paste(unnamed, collapse = ", "), + ".", + call. = FALSE + ) } - assigned_missing <- diagnostics$assigned_missing_indices - valid_idx <- setdiff(which(!is.na(values)), assigned_missing) + protected_idx <- union( + diagnostics$assigned_missing_indices, + diagnostics$preexisting_missing_code_indices + ) + valid_idx <- setdiff(which(!is.na(values)), protected_idx) remaining_idx <- valid_idx requested <- vapply(variable$garbage_rules, function(rule) { @@ -205,7 +244,7 @@ ) } - for (rule_name in names(variable$garbage_rules)) { + for (rule_name in .ordered_garbage_rule_names(names(variable$garbage_rules))) { rule <- variable$garbage_rules[[rule_name]] n_assign <- requested[[rule_name]] if (n_assign == 0) { @@ -256,6 +295,12 @@ #' attribute to the returned data frame. #' #' @return A data frame with post-processing applied. +#' +#' @details +#' Diagnostics are stored as a data-frame attribute. Base R subsetting and some +#' downstream tools may drop attributes, so preserve the original post-processed +#' object when diagnostics are part of the audit trail. +#' #' @family mock generation APIs #' @seealso [generate_mock_data_native()], [mock_spec()] #' @@ -277,6 +322,13 @@ postprocess_mock_data <- function(data, spec, seed = NULL, diagnostics = TRUE) { if (!is.data.frame(data)) { stop("data must be a data frame.", call. = FALSE) } + if (!is.null(attr(data, "mockdata_diagnostics"))) { + stop( + "postprocess_mock_data() appears to have already run on this data. ", + "Start from baseline generated data to avoid double post-processing.", + call. = FALSE + ) + } validate_mock_spec(spec, n = nrow(data), strict = TRUE) missing_columns <- setdiff(names(spec$variables), names(data)) diff --git a/tests/testthat/test-mock-spec-postprocess.R b/tests/testthat/test-mock-spec-postprocess.R index b4dc87f..cd942b4 100644 --- a/tests/testthat/test-mock-spec-postprocess.R +++ b/tests/testthat/test-mock-spec-postprocess.R @@ -22,6 +22,47 @@ test_that("postprocess_mock_data distinguishes missing-code collisions", { expect_true(any(baseline$response[diagnostics$preexisting_missing_code_indices] == "97")) }) +test_that("postprocess_mock_data does not overwrite preexisting missing-code collisions with garbage", { + spec <- mock_categorical( + "response", + levels = c("1", "97"), + proportions = c(0.5, 0.5), + rtype = "character", + missing_codes = "97", + missing_proportions = 0, + garbage_rules = list(low = list(proportion = 1, range = "[-2, 0]")) + ) + baseline <- generate_mock_data_native(spec, n = 100, seed = 13) + expect_true(any(baseline$response == "97")) + + result <- postprocess_mock_data(baseline, spec, seed = 14) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$response + + preexisting <- diagnostics$preexisting_missing_code_indices + garbage <- diagnostics$assigned_garbage_indices$low + + expect_length(intersect(preexisting, garbage), 0) + expect_true(all(result$response[preexisting] == "97")) +}) + +test_that("postprocess_mock_data rejects missing-proportion overflow", { + spec <- mock_categorical( + "response", + levels = c("1", "2"), + proportions = c(0.5, 0.5), + rtype = "character", + missing_codes = c("97", "98"), + missing_proportions = c(0.1, 0.1) + ) + spec$variables$response$missing_proportions <- c(0.6, 0.6) + baseline <- data.frame(response = rep(c("1", "2"), each = 10)) + + expect_error( + postprocess_mock_data(baseline, spec, seed = 16), + "missing proportions must sum" + ) +}) + test_that("postprocess_mock_data applies integer missing and garbage rules", { spec <- mock_continuous( "age", @@ -117,6 +158,23 @@ test_that("postprocess_mock_data preserves and extends factor levels", { expect_true(all(as.character(result$smoking[low_idx]) %in% c("-2", "-1", "0"))) }) +test_that("postprocess_mock_data applies garbage rules in canonical order", { + spec <- mock_continuous( + "age", + range = c(18, 85), + garbage_rules = list( + high = list(proportion = 0.1, range = "[150, 200]"), + low = list(proportion = 0.1, range = "[-10, 0]") + ) + ) + baseline <- generate_mock_data_native(spec, n = 20, seed = 81) + + result <- postprocess_mock_data(baseline, spec, seed = 82) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$age + + expect_equal(names(diagnostics$assigned_garbage_indices), c("low", "high")) +}) + test_that("postprocess_mock_data rejects unnamed garbage rules", { spec <- mock_continuous( "age", @@ -127,7 +185,7 @@ test_that("postprocess_mock_data rejects unnamed garbage rules", { expect_error( postprocess_mock_data(baseline, spec, seed = 82), - "named list" + "unnamed rule index: 1" ) }) @@ -160,3 +218,19 @@ test_that("postprocess_mock_data validates input shape and diagnostics opt-out", result <- postprocess_mock_data(baseline, spec, diagnostics = FALSE) expect_null(attr(result, "mockdata_diagnostics")) }) + +test_that("postprocess_mock_data rejects idempotent re-call", { + spec <- mock_continuous( + "age", + range = c(18, 85), + missing_codes = 997, + missing_proportions = 0.1 + ) + baseline <- generate_mock_data_native(spec, n = 20, seed = 91) + result <- postprocess_mock_data(baseline, spec, seed = 92) + + expect_error( + postprocess_mock_data(result, spec, seed = 93), + "already run" + ) +}) From 2b6aa1e2aedf7e41d883c10a54b990532863d47e Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 18 May 2026 19:55:11 -0400 Subject: [PATCH 11/18] Promote mock_spec pipeline assertions --- tests/testthat/test-mock-spec-pipeline.R | 145 +++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 tests/testthat/test-mock-spec-pipeline.R diff --git a/tests/testthat/test-mock-spec-pipeline.R b/tests/testthat/test-mock-spec-pipeline.R new file mode 100644 index 0000000..a53cf26 --- /dev/null +++ b/tests/testthat/test-mock-spec-pipeline.R @@ -0,0 +1,145 @@ +run_native_pipeline <- function(spec, n, seed) { + baseline <- generate_mock_data_native(spec, n = n, seed = seed) + postprocess_mock_data(baseline, spec, seed = seed + 1) +} + +test_that("native mock_spec pipeline preserves categorical codes and diagnostics", { + spec <- mock_categorical( + "response", + levels = c("1", "97"), + proportions = c(0.55, 0.45), + rtype = "character", + missing_codes = "97", + missing_proportions = 0.2, + garbage_rules = list(low = list(proportion = 0.4, range = "[-2, 0]")) + ) + + result <- run_native_pipeline(spec, n = 200, seed = 101) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$response + + expect_true(all(result$response %in% c("1", "97", "-2", "-1", "0"))) + expect_equal(length(diagnostics$assigned_missing_indices), 40) + expect_true(length(diagnostics$preexisting_missing_code_indices) > 0) + expect_length(intersect( + diagnostics$preexisting_missing_code_indices, + diagnostics$assigned_missing_indices + ), 0) + expect_length(intersect( + diagnostics$preexisting_missing_code_indices, + diagnostics$assigned_garbage_indices$low + ), 0) + expect_true(all(result$response[diagnostics$preexisting_missing_code_indices] == "97")) + expect_true(all(result$response[diagnostics$assigned_missing_indices] == "97")) +}) + +test_that("native mock_spec pipeline is reproducible as a composed workflow", { + spec <- mock_spec( + mock_spec_continuous( + "age", + range = c(18, 85), + distribution = "normal", + mean = 50, + sd = 12, + rtype = "integer", + missing_codes = 997, + missing_proportions = 0.05, + garbage_rules = list(high = list(proportion = 0.05, range = "[150, 200]")) + ), + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + ) + + first <- run_native_pipeline(spec, n = 100, seed = 202) + second <- run_native_pipeline(spec, n = 100, seed = 202) + + expect_equal(first, second) + expect_equal( + attr(first, "mockdata_diagnostics"), + attr(second, "mockdata_diagnostics") + ) +}) + +test_that("recodeflow pipeline preserves recEnd-driven missingness and garbage", { + variables <- data.frame( + variable = "smoking", + variableType = "Categorical", + rType = "character", + role = "enabled", + garbage_low_prop = 0.1, + garbage_low_range = "[-2, 0]", + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = "smoking", + recStart = c("1", "2", "97", "99"), + recEnd = c("copy", "copy", "NA::b", "NA::b"), + proportion = c(0.5, 0.3, 0.1, 0.1), + stringsAsFactors = FALSE + ) + spec <- mock_spec_from_recodeflow(variables, variable_details) + + result <- run_native_pipeline(spec, n = 100, seed = 303) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$smoking + + expect_equal(spec$variables$smoking$levels, c("1", "2")) + expect_equal(spec$variables$smoking$missing_codes, c("97", "99")) + expect_equal(length(diagnostics$assigned_missing_indices), 20) + expect_equal(length(diagnostics$assigned_garbage_indices$low), 8) + expect_true(all(result$smoking[diagnostics$assigned_missing_indices] %in% c("97", "99"))) + expect_true(all(result$smoking[diagnostics$assigned_garbage_indices$low] %in% c("-2", "-1", "0"))) + expect_length(intersect( + diagnostics$assigned_missing_indices, + diagnostics$assigned_garbage_indices$low + ), 0) +}) + +test_that("direct and recodeflow pipelines agree for equivalent specs", { + direct_spec <- mock_categorical( + "smoking", + levels = c("1", "2"), + proportions = c(0.625, 0.375), + rtype = "character", + missing_codes = c("97", "99"), + missing_proportions = c(0.1, 0.1), + garbage_rules = list(low = list(proportion = 0.1, range = "[-2, 0]")) + ) + variables <- data.frame( + variable = "smoking", + variableType = "Categorical", + rType = "character", + role = "enabled", + garbage_low_prop = 0.1, + garbage_low_range = "[-2, 0]", + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = "smoking", + recStart = c("1", "2", "97", "99"), + recEnd = c("copy", "copy", "NA::b", "NA::b"), + proportion = c(0.5, 0.3, 0.1, 0.1), + stringsAsFactors = FALSE + ) + recodeflow_spec <- mock_spec_from_recodeflow(variables, variable_details) + + direct_result <- run_native_pipeline(direct_spec, n = 100, seed = 404) + recodeflow_result <- run_native_pipeline(recodeflow_spec, n = 100, seed = 404) + + attr(direct_result, "mockdata_diagnostics") <- NULL + attr(recodeflow_result, "mockdata_diagnostics") <- NULL + expect_equal(direct_result, recodeflow_result) +}) + +test_that("pipeline keeps deferred formula variables loud", { + variable <- mock_spec_continuous("bmi", range = c(15, 50)) + variable$formula <- "weight / height^2" + spec <- mock_spec(variable) + + expect_error( + run_native_pipeline(spec, n = 10, seed = 505), + "Formula evaluation is not yet implemented" + ) +}) From c28354a2ef1bfa273d8488cb0c60e0ea3e1ec81e Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Tue, 19 May 2026 19:37:32 -0400 Subject: [PATCH 12/18] Add optional simstudy mock_spec backend --- DESCRIPTION | 1 + NAMESPACE | 1 + NEWS.md | 3 + R/mock_spec_simstudy.R | 169 +++++++++++++++++++++++++ tests/testthat/test-simstudy-backend.R | 108 ++++++++++++++++ 5 files changed, 282 insertions(+) create mode 100644 R/mock_spec_simstudy.R create mode 100644 tests/testthat/test-simstudy-backend.R diff --git a/DESCRIPTION b/DESCRIPTION index 6a79c59..3f5a388 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Suggests: readr, stringr, lubridate, + simstudy, knitr, quarto, devtools, diff --git a/NAMESPACE b/NAMESPACE index 52cae5a..987afd8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(extract_distribution_params) export(extract_proportions) export(generate_garbage_values) export(generate_mock_data_native) +export(generate_mock_data_simstudy) export(get_cycle_variables) export(get_enabled_variables) export(get_raw_var_dependencies) diff --git a/NEWS.md b/NEWS.md index fe6fda1..f5aa558 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,6 +19,9 @@ - Added `postprocess_mock_data()` to apply `mock_spec` missing-code and garbage-value rules after baseline generation, with diagnostics that distinguish assigned missing/garbage rows from naturally drawn values. +- Added `generate_mock_data_simstudy()` as a soft-gated optional backend for + baseline categorical and uniform continuous generation when `simstudy` is + installed, with native generation retained for MockData-specific semantics. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec_simstudy.R b/R/mock_spec_simstudy.R new file mode 100644 index 0000000..776dce4 --- /dev/null +++ b/R/mock_spec_simstudy.R @@ -0,0 +1,169 @@ +# ============================================================================== +# MockData v0.4 Optional simstudy Backend +# ============================================================================== +# Baseline generation from mock_spec using simstudy when the optional package is +# installed. MockData still owns post-processing and diagnostics. +# ============================================================================== + +.require_simstudy <- function() { + if (!requireNamespace("simstudy", quietly = TRUE)) { + stop( + "The optional simstudy backend requires the 'simstudy' package. ", + "Install simstudy or use generate_mock_data_native().", + call. = FALSE + ) + } + + invisible(TRUE) +} + +.simstudy_definition <- function(def, variable) { + if (variable$type == "continuous") { + distribution <- tolower(variable$distribution %||% "uniform") + if (distribution == "uniform") { + return(simstudy::defData( + dtDefs = def, + varname = variable$name, + formula = paste(variable$range, collapse = ";"), + dist = "uniform" + )) + } + } + + if (variable$type == "categorical") { + probabilities <- variable$proportions + if (is.null(probabilities)) { + probabilities <- rep(1 / length(variable$levels), length(variable$levels)) + } + + return(simstudy::defData( + dtDefs = def, + varname = variable$name, + formula = paste(probabilities, collapse = ";"), + variance = paste(variable$levels, collapse = ";"), + dist = "categorical" + )) + } + + stop( + "simstudy backend does not yet support variable '", variable$name, + "' of type '", variable$type, "'.", + call. = FALSE + ) +} + +.simstudy_can_generate <- function(variable) { + if (variable$type == "categorical") { + return(TRUE) + } + + variable$type == "continuous" && + identical(tolower(variable$distribution %||% "uniform"), "uniform") +} + +.simstudy_variables <- function(spec) { + Filter(.simstudy_can_generate, spec$variables) +} + +.native_only_variables <- function(spec) { + Filter(function(variable) !.simstudy_can_generate(variable), spec$variables) +} + +.generate_simstudy_baseline <- function(variables, n) { + if (length(variables) == 0) { + return(.empty_native_data(n)) + } + + def <- NULL + for (variable in variables) { + def <- .simstudy_definition(def, variable) + } + + generated <- as.data.frame(simstudy::genData(n, def), stringsAsFactors = FALSE) + generated <- generated[, names(variables), drop = FALSE] + + for (variable_name in names(variables)) { + variable <- variables[[variable_name]] + if (variable$type == "continuous") { + generated[[variable_name]] <- .coerce_native_continuous( + generated[[variable_name]], + variable$rtype, + variable$name + ) + } else if (variable$type == "categorical") { + generated[[variable_name]] <- .coerce_native_categorical( + as.character(generated[[variable_name]]), + as.character(variable$levels), + variable$rtype, + variable$name + ) + } + } + + generated +} + +.generate_native_only_baseline <- function(variables, n) { + if (length(variables) == 0) { + return(.empty_native_data(n)) + } + + columns <- lapply(variables, .generate_native_variable, n = n) + names(columns) <- names(variables) + as.data.frame(columns, stringsAsFactors = FALSE, check.names = FALSE) +} + +#' Generate mock data with the optional simstudy backend +#' +#' `generate_mock_data_simstudy()` consumes a validated `mock_spec` and +#' generates baseline valid values through the optional `simstudy` package for +#' supported uniform continuous and categorical variables. MockData remains +#' responsible for missing-code injection, garbage values, and diagnostics +#' through [postprocess_mock_data()]. +#' +#' Variables that need MockData semantics not covered by this milestone, such as +#' truncated normal ranges and calendar dates, are generated by MockData's native +#' path inside the same seeded call. +#' +#' @param spec A `mock_spec` object. +#' @param n Non-negative whole number of rows to generate. +#' @param seed Optional whole-number random seed. The previous R random state is +#' restored after generation. +#' +#' @return A data frame with one column per `mock_spec` variable and `n` rows. +#' @family mock generation APIs +#' @seealso [generate_mock_data_native()], [postprocess_mock_data()], +#' [mock_spec()] +#' +#' @examples +#' spec <- mock_continuous("age", range = c(18, 85), rtype = "integer") +#' if (requireNamespace("simstudy", quietly = TRUE)) { +#' data <- generate_mock_data_simstudy(spec, n = 10, seed = 1) +#' head(data) +#' } +#' +#' @export +generate_mock_data_simstudy <- function(spec, n, seed = NULL) { + .require_simstudy() + validate_mock_spec(spec, n = n, strict = TRUE) + .check_native_backend_scope(spec) + + simstudy_variables <- .simstudy_variables(spec) + native_only_variables <- .native_only_variables(spec) + + .with_mock_seed(seed, { + simstudy_data <- .generate_simstudy_baseline(simstudy_variables, n) + native_data <- .generate_native_only_baseline(native_only_variables, n) + + columns <- c(simstudy_data, native_data) + if (length(spec$variables) == 0) { + return(.empty_native_data(n)) + } + + as.data.frame( + columns[names(spec$variables)], + stringsAsFactors = FALSE, + check.names = FALSE + ) + }) +} diff --git a/tests/testthat/test-simstudy-backend.R b/tests/testthat/test-simstudy-backend.R new file mode 100644 index 0000000..e479d44 --- /dev/null +++ b/tests/testthat/test-simstudy-backend.R @@ -0,0 +1,108 @@ +test_that("generate_mock_data_simstudy fails clearly when simstudy is unavailable", { + if (requireNamespace("simstudy", quietly = TRUE)) { + skip("simstudy is installed; unavailable-path test is not applicable") + } + + spec <- mock_continuous("age", range = c(18, 85)) + + expect_error( + generate_mock_data_simstudy(spec, n = 10, seed = 1), + "requires the 'simstudy' package" + ) +}) + +test_that("generate_mock_data_simstudy generates supported baseline specs", { + skip_if_not_installed("simstudy") + + spec <- mock_spec( + mock_spec_continuous("age", range = c(18, 85), rtype = "integer"), + mock_spec_continuous( + "bmi", + range = c(15, 50), + distribution = "normal", + mean = 27, + sd = 5 + ), + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ), + mock_spec_date("interview_date", range = as.Date(c("2001-01-01", "2001-01-31"))) + ) + + result <- generate_mock_data_simstudy(spec, n = 1000, seed = 707) + + expect_named(result, c("age", "bmi", "smoking", "interview_date")) + expect_type(result$age, "integer") + expect_type(result$bmi, "double") + expect_type(result$smoking, "character") + expect_s3_class(result$interview_date, "Date") + expect_true(all(result$age >= 18 & result$age <= 85)) + expect_true(all(result$bmi >= 15 & result$bmi <= 50)) + expect_equal(mean(result$bmi), 27, tolerance = 1) + + observed <- prop.table(table(factor( + result$smoking, + levels = c("never", "former", "current") + ))) + expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.05) +}) + +test_that("generate_mock_data_simstudy composes with MockData post-processing", { + skip_if_not_installed("simstudy") + + spec <- mock_categorical( + "response", + levels = c("1", "97"), + proportions = c(0.6, 0.4), + rtype = "character", + missing_codes = "97", + missing_proportions = 0.2, + garbage_rules = list(low = list(proportion = 0.2, range = "[-2, 0]")) + ) + + baseline <- generate_mock_data_simstudy(spec, n = 100, seed = 808) + result <- postprocess_mock_data(baseline, spec, seed = 809) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$response + + expect_equal(length(diagnostics$assigned_missing_indices), 20) + expect_true(length(diagnostics$preexisting_missing_code_indices) > 0) + expect_length(intersect( + diagnostics$preexisting_missing_code_indices, + diagnostics$assigned_garbage_indices$low + ), 0) +}) + +test_that("generate_mock_data_simstudy is reproducible", { + skip_if_not_installed("simstudy") + + spec <- mock_spec( + mock_spec_continuous("age", range = c(18, 85), rtype = "integer"), + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + ) + + first <- generate_mock_data_simstudy(spec, n = 100, seed = 909) + second <- generate_mock_data_simstudy(spec, n = 100, seed = 909) + + expect_equal(first, second) +}) + +test_that("generate_mock_data_simstudy keeps deferred formula variables loud", { + skip_if_not_installed("simstudy") + + variable <- mock_spec_continuous("bmi", range = c(15, 50)) + variable$formula <- "weight / height^2" + spec <- mock_spec(variable) + + expect_error( + generate_mock_data_simstudy(spec, n = 10, seed = 1), + "Formula evaluation is not yet implemented" + ) +}) From 70da7b85815dc165b575856c073eb16d4f103997 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Tue, 19 May 2026 19:57:20 -0400 Subject: [PATCH 13/18] Harden optional simstudy backend --- DESCRIPTION | 2 +- R/mock_spec_simstudy.R | 51 +++++++++- tests/testthat/test-simstudy-backend.R | 127 ++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3f5a388..dbb0f20 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,7 +32,7 @@ Suggests: readr, stringr, lubridate, - simstudy, + simstudy (>= 0.8.1), knitr, quarto, devtools, diff --git a/R/mock_spec_simstudy.R b/R/mock_spec_simstudy.R index 776dce4..2535623 100644 --- a/R/mock_spec_simstudy.R +++ b/R/mock_spec_simstudy.R @@ -13,11 +13,40 @@ call. = FALSE ) } + if (utils::packageVersion("simstudy") < "0.8.1") { + stop( + "The optional simstudy backend requires simstudy >= 0.8.1. ", + "Install a newer simstudy version or use generate_mock_data_native().", + call. = FALSE + ) + } + + invisible(TRUE) +} + +.check_simstudy_variable <- function(variable) { + if (identical(variable$name, "id")) { + stop( + "Variable name 'id' conflicts with simstudy's generated row identifier. ", + "Rename the variable or use generate_mock_data_native().", + call. = FALSE + ) + } + + if (variable$type == "categorical" && any(grepl(";", as.character(variable$levels), fixed = TRUE))) { + stop( + "Variable '", variable$name, + "' has categorical level(s) containing ';', which simstudy uses as a delimiter.", + call. = FALSE + ) + } invisible(TRUE) } .simstudy_definition <- function(def, variable) { + .check_simstudy_variable(variable) + if (variable$type == "continuous") { distribution <- tolower(variable$distribution %||% "uniform") if (distribution == "uniform") { @@ -52,6 +81,26 @@ ) } +.normalize_simstudy_categorical <- function(values, variable) { + value_chr <- as.character(values) + levels <- as.character(variable$levels) + if (all(value_chr %in% levels)) { + return(value_chr) + } + + index <- suppressWarnings(as.integer(value_chr)) + if (!any(is.na(index)) && all(index >= 1 & index <= length(levels))) { + return(levels[index]) + } + + stop( + "simstudy returned categorical values for variable '", variable$name, + "' that do not match the mock_spec levels. ", + "This may indicate a simstudy version or delimiter mismatch.", + call. = FALSE + ) +} + .simstudy_can_generate <- function(variable) { if (variable$type == "categorical") { return(TRUE) @@ -92,7 +141,7 @@ ) } else if (variable$type == "categorical") { generated[[variable_name]] <- .coerce_native_categorical( - as.character(generated[[variable_name]]), + .normalize_simstudy_categorical(generated[[variable_name]], variable), as.character(variable$levels), variable$rtype, variable$name diff --git a/tests/testthat/test-simstudy-backend.R b/tests/testthat/test-simstudy-backend.R index e479d44..93924b5 100644 --- a/tests/testthat/test-simstudy-backend.R +++ b/tests/testthat/test-simstudy-backend.R @@ -50,6 +50,114 @@ test_that("generate_mock_data_simstudy generates supported baseline specs", { expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.05) }) +test_that("generate_mock_data_simstudy protects simstudy-specific categorical contracts", { + skip_if_not_installed("simstudy") + + semicolon <- mock_categorical( + "group", + levels = c("never;former", "current"), + proportions = c(0.5, 0.5), + rtype = "character" + ) + expect_error( + generate_mock_data_simstudy(semicolon, n = 10, seed = 1), + "containing ';'" + ) + + reserved <- mock_categorical( + "id", + levels = c("a", "b"), + proportions = c(0.5, 0.5), + rtype = "character" + ) + expect_error( + generate_mock_data_simstudy(reserved, n = 10, seed = 1), + "conflicts with simstudy" + ) +}) + +test_that("simstudy categorical normalization handles labels and old integer indices", { + variable <- mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + + expect_equal( + MockData:::.normalize_simstudy_categorical(c("never", "current"), variable), + c("never", "current") + ) + expect_equal( + MockData:::.normalize_simstudy_categorical(c(1L, 3L), variable), + c("never", "current") + ) + expect_error( + MockData:::.normalize_simstudy_categorical(c("mystery"), variable), + "do not match" + ) +}) + +test_that("generate_mock_data_simstudy roughly matches native contracts when installed", { + skip_if_not_installed("simstudy") + + spec <- mock_spec( + mock_spec_continuous("age", range = c(18, 85), rtype = "integer"), + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ) + ) + + native <- generate_mock_data_native(spec, n = 5000, seed = 1001) + simstudy <- generate_mock_data_simstudy(spec, n = 5000, seed = 1001) + + expect_named(simstudy, names(native)) + expect_type(simstudy$age, "integer") + expect_type(simstudy$smoking, "character") + expect_equal(mean(simstudy$age), mean(native$age), tolerance = 2) + expect_equal(stats::sd(simstudy$age), stats::sd(native$age), tolerance = 2) + + observed <- prop.table(table(factor( + simstudy$smoking, + levels = c("never", "former", "current") + ))) + expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.05) +}) + +test_that("generate_mock_data_simstudy routes unsupported pieces through native backend", { + skip_if_not_installed("simstudy") + + spec <- mock_spec( + mock_spec_categorical( + "smoking", + levels = c("never", "former", "current"), + proportions = c(0.5, 0.3, 0.2), + rtype = "character" + ), + mock_spec_continuous( + "bmi", + range = c(15, 50), + distribution = "normal", + mean = 27, + sd = 5 + ), + mock_spec_date("interview_date", range = as.Date(c("2001-01-01", "2001-01-31"))) + ) + + expect_false(MockData:::.simstudy_can_generate(spec$variables$bmi)) + expect_false(MockData:::.simstudy_can_generate(spec$variables$interview_date)) + + result <- generate_mock_data_simstudy(spec, n = 1000, seed = 1002) + expect_true(all(result$bmi >= 15 & result$bmi <= 50)) + expect_equal(mean(result$bmi), 27, tolerance = 1) + expect_s3_class(result$interview_date, "Date") + expect_true(all(result$interview_date >= as.Date("2001-01-01"))) + expect_true(all(result$interview_date <= as.Date("2001-01-31"))) +}) + test_that("generate_mock_data_simstudy composes with MockData post-processing", { skip_if_not_installed("simstudy") @@ -91,7 +199,24 @@ test_that("generate_mock_data_simstudy is reproducible", { first <- generate_mock_data_simstudy(spec, n = 100, seed = 909) second <- generate_mock_data_simstudy(spec, n = 100, seed = 909) - expect_equal(first, second) + expect_identical(first, second) +}) + +test_that("generate_mock_data_simstudy handles empty specs and n = 0", { + skip_if_not_installed("simstudy") + + empty <- generate_mock_data_simstudy(mock_spec(), n = 5, seed = 1) + expect_s3_class(empty, "data.frame") + expect_equal(nrow(empty), 5) + expect_equal(ncol(empty), 0) + + zero <- generate_mock_data_simstudy( + mock_categorical("smoking", levels = c("never", "former", "current")), + n = 0, + seed = 1 + ) + expect_equal(nrow(zero), 0) + expect_named(zero, "smoking") }) test_that("generate_mock_data_simstudy keeps deferred formula variables loud", { From e109d546865fa925da454ebb2fac24b529b5b405 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Tue, 19 May 2026 20:06:09 -0400 Subject: [PATCH 14/18] Polish v0.4 reference documentation --- NEWS.md | 6 ++++++ R/mock_spec.R | 39 ++++++++++++++++++++++++++++++++++++++- R/mock_spec_native.R | 33 ++++++++++++++++++++++++++++++++- R/mock_spec_postprocess.R | 35 +++++++++++++++++++++++++++++++---- R/mock_spec_recodeflow.R | 38 +++++++++++++++++++++++++++++++++++++- R/mock_spec_simstudy.R | 30 ++++++++++++++++++++++++++++++ 6 files changed, 174 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index f5aa558..9bdd9b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,9 +19,15 @@ - Added `postprocess_mock_data()` to apply `mock_spec` missing-code and garbage-value rules after baseline generation, with diagnostics that distinguish assigned missing/garbage rows from naturally drawn values. +- Post-processing diagnostics now protect naturally drawn missing-code + collisions from later garbage assignment, apply garbage rules in canonical + `low` -> `high` -> other order, and stop on repeated post-processing. - Added `generate_mock_data_simstudy()` as a soft-gated optional backend for baseline categorical and uniform continuous generation when `simstudy` is installed, with native generation retained for MockData-specific semantics. +- The optional `simstudy` backend is kept in `Suggests`, requires + `simstudy >= 0.8.1`, and validates categorical labels before converting + generated values back into MockData's `mock_spec` levels. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec.R b/R/mock_spec.R index 7fd3e51..5efd59e 100644 --- a/R/mock_spec.R +++ b/R/mock_spec.R @@ -46,10 +46,12 @@ NULL if (is.null(x)) y else x } +#' @noRd .is_non_empty_string <- function(x) { is.character(x) && length(x) == 1 && !is.na(x) && nzchar(trimws(x)) } +#' @noRd .normalize_provenance <- function(provenance, source = NULL) { if (is.null(provenance)) { provenance <- list(adapter = "direct", source = source %||% "direct") @@ -75,6 +77,7 @@ NULL ) } +#' @noRd .validate_model_hint <- function(model_hint) { if (length(model_hint) != 1 || is.na(model_hint) || !model_hint %in% .mock_spec_model_hints) { stop( @@ -87,6 +90,7 @@ NULL invisible(TRUE) } +#' @noRd .new_mock_spec_variable <- function(name, type, rtype, @@ -137,6 +141,7 @@ NULL ) } +#' @noRd .as_mock_spec_variable_list <- function(...) { variables <- list(...) @@ -160,6 +165,7 @@ NULL variables } +#' @noRd .direct_api_provenance <- function(source, provenance = NULL) { provenance <- provenance %||% list(source = source) @@ -178,6 +184,14 @@ NULL #' new architecture. Direct APIs and recodeflow adapters should both normalize #' into this shape before validation and generation. #' +#' @details +#' The v0.4 API is layered. The `mock_*()` helpers are the simple direct API for +#' one-variable specifications. The `mock_spec_*()` constructors create variable +#' specifications that can be composed with `mock_spec()`. Metadata adapters, +#' such as [mock_spec_from_recodeflow()], translate external metadata into the +#' same internal shape. Generation backends consume `mock_spec` objects rather +#' than re-reading user-facing metadata. +#' #' @param ... `mock_spec_variable` objects, or a single list of them. `NULL` #' creates an empty specification. #' @param spec_version Character version of the specification shape. @@ -189,7 +203,9 @@ NULL #' #' @return S3 object of class `mock_spec`. #' @family mock specification APIs -#' @seealso [mock_continuous()], [mock_categorical()], [mock_date()] +#' @seealso [mock_continuous()], [mock_categorical()], [mock_date()], +#' [mock_spec_from_recodeflow()], [generate_mock_data_native()], +#' [postprocess_mock_data()] #' #' @examples #' spec <- mock_spec( @@ -233,6 +249,11 @@ mock_spec <- function(..., #' returns a validated `mock_spec`; it does not generate data. Generation #' backends will consume this specification in a later v0.4 milestone. #' +#' @details +#' Use `mock_continuous()` when specifying one variable directly in R code. Use +#' [mock_spec_continuous()] with [mock_spec()] when composing several variables +#' or when writing an adapter from another metadata source. +#' #' @param name Variable name. #' @param range Numeric vector of length two giving the inclusive valid range. #' @param distribution Distribution name. Defaults to `"uniform"`. @@ -302,6 +323,11 @@ mock_continuous <- function(name, #' `mock_categorical()` is the simple direct API for categorical variables. It #' returns a validated `mock_spec`; it does not generate data. #' +#' @details +#' Use `mock_categorical()` when specifying one variable directly in R code. Use +#' [mock_spec_categorical()] with [mock_spec()] when composing several variables +#' or when writing an adapter from another metadata source. +#' #' @param name Variable name. #' @param levels Character vector of valid levels or codes. #' @param proportions Optional probabilities aligned to `levels`. @@ -363,6 +389,11 @@ mock_categorical <- function(name, #' `mock_date()` is the simple direct API for date variables. It returns a #' validated `mock_spec`; it does not generate data. #' +#' @details +#' Date variables default to `model_hint = "native-postprocess"` because MockData +#' owns calendar-date generation and source-format conversion. Optional +#' backends may still generate other variables in the same specification. +#' #' @param name Variable name. #' @param range Date vector of length two giving the inclusive valid date range. #' @param rtype R output type. Defaults to `"date"`. @@ -587,6 +618,7 @@ is_mock_spec <- function(x) { inherits(x, "mock_spec") } +#' @noRd .new_mock_spec_validation_result <- function(valid = TRUE, errors = character(0), warnings = character(0), @@ -631,6 +663,7 @@ print.mock_spec_validation_result <- function(x, ...) { invisible(x) } +#' @noRd .validate_probability_vector <- function(values, label, allow_null = FALSE) { errors <- character(0) @@ -655,6 +688,7 @@ print.mock_spec_validation_result <- function(x, ...) { errors } +#' @noRd .validate_provenance <- function(provenance, label) { errors <- character(0) @@ -671,6 +705,7 @@ print.mock_spec_validation_result <- function(x, ...) { errors } +#' @noRd .validate_missing_spec <- function(variable) { errors <- character(0) @@ -702,6 +737,7 @@ print.mock_spec_validation_result <- function(x, ...) { errors } +#' @noRd .validate_range <- function(range, variable_name, expected_class = "numeric") { errors <- character(0) @@ -726,6 +762,7 @@ print.mock_spec_validation_result <- function(x, ...) { errors } +#' @noRd .validate_mock_spec_variable <- function(variable) { errors <- character(0) diff --git a/R/mock_spec_native.R b/R/mock_spec_native.R index 990cbf2..53c8286 100644 --- a/R/mock_spec_native.R +++ b/R/mock_spec_native.R @@ -5,6 +5,7 @@ # garbage, diagnostics, and richer rType handling lands in later milestones. # ============================================================================== +#' @noRd .with_mock_seed <- function(seed, expr) { if (is.null(seed)) { return(force(expr)) @@ -19,6 +20,7 @@ old_seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE) } + # Generation should be reproducible without changing the caller's RNG stream. on.exit({ if (had_seed) { assign(".Random.seed", old_seed, envir = .GlobalEnv) @@ -31,10 +33,12 @@ force(expr) } +#' @noRd .empty_native_data <- function(n) { data.frame(row.names = seq_len(n)) } +#' @noRd .sample_indices <- function(n_levels, n, prob = NULL) { if (n == 0) { return(integer(0)) @@ -42,6 +46,7 @@ sample.int(n_levels, size = n, replace = TRUE, prob = prob) } +#' @noRd .native_formula_variables <- function(spec) { names(Filter(function(variable) { formula <- variable$formula @@ -50,6 +55,7 @@ }, spec$variables)) } +#' @noRd .check_native_backend_scope <- function(spec) { formula_variables <- .native_formula_variables(spec) if (length(formula_variables) > 0) { @@ -65,6 +71,7 @@ invisible(TRUE) } +#' @noRd .native_truncated_normal <- function(n, mean, sd, range, variable_name) { if (n == 0) { return(numeric(0)) @@ -84,6 +91,8 @@ } if (length(remaining) > 0) { + # Pathological truncation windows can make rejection sampling impractical; + # keep generation bounded and tell the caller the tail came from uniform. warning( "Variable '", variable_name, "': could not fill all truncated-normal values by rejection sampling; ", @@ -96,6 +105,7 @@ values } +#' @noRd .coerce_native_continuous <- function(values, rtype, variable_name) { if (rtype == "integer") { return(as.integer(round(values))) @@ -111,6 +121,7 @@ ) } +#' @noRd .coerce_native_categorical <- function(values, levels, rtype, variable_name) { if (rtype == "factor") { return(factor(values, levels = levels)) @@ -158,6 +169,7 @@ ) } +#' @noRd .coerce_native_date <- function(values, rtype, variable_name) { if (rtype == "date") { return(values) @@ -173,6 +185,7 @@ ) } +#' @noRd .generate_native_continuous <- function(variable, n) { distribution <- tolower(variable$distribution %||% "uniform") @@ -197,6 +210,7 @@ .coerce_native_continuous(values, variable$rtype, variable$name) } +#' @noRd .generate_native_categorical <- function(variable, n) { levels <- as.character(variable$levels) prob <- variable$proportions @@ -208,6 +222,7 @@ .coerce_native_categorical(values, levels, variable$rtype, variable$name) } +#' @noRd .generate_native_date <- function(variable, n) { distribution <- tolower(variable$distribution %||% "uniform") if (distribution != "uniform") { @@ -222,6 +237,8 @@ values <- as.Date(character(0)) } else { range_numeric <- as.integer(variable$range) + # Sample day offsets numerically, then restore Date class; this avoids + # locale-dependent date parsing during generation. offsets <- .sample_indices( range_numeric[[2]] - range_numeric[[1]] + 1, n @@ -232,6 +249,7 @@ .coerce_native_date(values, variable$rtype, variable$name) } +#' @noRd .generate_native_variable <- function(variable, n) { if (variable$type == "continuous") { return(.generate_native_continuous(variable, n)) @@ -257,6 +275,18 @@ #' not yet apply missing-code injection, garbage values, diagnostics, formula #' evaluation, or optional `simstudy` features. #' +#' @details +#' The native backend is the default MIT-licensed baseline engine. It currently +#' supports uniform continuous variables, truncated-normal continuous variables, +#' categorical variables, and uniform calendar dates. Missing codes, garbage +#' values, and diagnostics are intentionally handled by [postprocess_mock_data()] +#' so that all backends share the same audit trail. +#' +#' If `seed` is supplied, the previous R random state is restored after +#' generation. This gives reproducible output without advancing the caller's RNG +#' stream. Formula variables are rejected loudly until the formula/dependency +#' milestone promotes the spike evaluator into production. +#' #' @param spec A `mock_spec` object. #' @param n Non-negative whole number of rows to generate. #' @param seed Optional whole-number random seed. The previous R random state is @@ -264,7 +294,8 @@ #' #' @return A data frame with one column per `mock_spec` variable and `n` rows. #' @family mock generation APIs -#' @seealso [mock_spec()], [mock_continuous()], [mock_spec_from_recodeflow()] +#' @seealso [mock_spec()], [mock_continuous()], [mock_spec_from_recodeflow()], +#' [postprocess_mock_data()], [generate_mock_data_simstudy()] #' #' @examples #' spec <- mock_spec( diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R index fc1b671..d2751d4 100644 --- a/R/mock_spec_postprocess.R +++ b/R/mock_spec_postprocess.R @@ -6,6 +6,7 @@ # value collisions. # ============================================================================== +#' @noRd .postprocess_empty_diagnostics <- function(spec, n) { variables <- lapply(spec$variables, function(variable) { garbage_rule_names <- names(variable$garbage_rules) @@ -38,6 +39,7 @@ ) } +#' @noRd .values_match_codes <- function(values, codes) { if (length(codes) == 0) { return(rep(FALSE, length(values))) @@ -46,6 +48,7 @@ as.character(values) %in% as.character(codes) } +#' @noRd .sample_postprocess_indices <- function(candidates, n, avoid = integer(0)) { if (n == 0) { return(integer(0)) @@ -65,6 +68,7 @@ ) } +#' @noRd .coerce_postprocess_values <- function(values, variable, target) { if (inherits(target, "factor")) { return(as.character(values)) @@ -105,6 +109,7 @@ as.character(values) } +#' @noRd .assign_postprocess_values <- function(target, indices, values) { if (length(indices) == 0) { return(target) @@ -121,6 +126,7 @@ target } +#' @noRd .generate_garbage_for_rule <- function(rule, variable, n) { if (n == 0) { return(vector(mode = "character", length = 0)) @@ -152,10 +158,14 @@ values } +#' @noRd .ordered_garbage_rule_names <- function(rule_names) { + # Keep the long-standing garbage convention deterministic: low rules run + # before high rules; any future rule names follow in caller order. c(intersect(c("low", "high"), rule_names), setdiff(rule_names, c("low", "high"))) } +#' @noRd .postprocess_missing <- function(values, variable, diagnostics) { if (length(variable$missing_codes) == 0) { return(list(values = values, diagnostics = diagnostics)) @@ -169,6 +179,8 @@ } available <- seq_along(values) + # Record values that naturally collide with declared missing codes before + # assigning any new missing codes; this is the auditability contract. preexisting <- which(.values_match_codes(values, variable$missing_codes)) assigned <- integer(0) assigned_codes <- character(0) @@ -202,6 +214,7 @@ list(values = values, diagnostics = diagnostics) } +#' @noRd .postprocess_garbage <- function(values, variable, diagnostics) { if (length(variable$garbage_rules) == 0) { return(list(values = values, diagnostics = diagnostics)) @@ -225,6 +238,9 @@ diagnostics$assigned_missing_indices, diagnostics$preexisting_missing_code_indices ) + # Garbage must not overwrite either assigned missing rows or naturally drawn + # missing-code collisions, otherwise diagnostics would no longer describe the + # returned data. valid_idx <- setdiff(which(!is.na(values)), protected_idx) remaining_idx <- valid_idx @@ -268,6 +284,7 @@ list(values = values, diagnostics = diagnostics) } +#' @noRd .postprocess_variable <- function(values, variable, diagnostics) { missing_result <- .postprocess_missing(values, variable, diagnostics) garbage_result <- .postprocess_garbage( @@ -297,12 +314,22 @@ #' @return A data frame with post-processing applied. #' #' @details -#' Diagnostics are stored as a data-frame attribute. Base R subsetting and some -#' downstream tools may drop attributes, so preserve the original post-processed -#' object when diagnostics are part of the audit trail. +#' Missing-code diagnostics separate values that were naturally drawn as a +#' declared missing code (`preexisting_missing_code_indices`) from values that +#' were assigned by post-processing (`assigned_missing_indices`). Garbage rules +#' are applied only to rows that are not missing-code diagnostics, preserving the +#' audit trail for collision cases such as a valid category code that is also a +#' declared missing code. +#' +#' Garbage rules are applied in canonical order: `low`, then `high`, then any +#' other named rules in caller order. Diagnostics are stored as a data-frame +#' attribute. Base R subsetting and some downstream tools may drop attributes, +#' so preserve the original post-processed object when diagnostics are part of +#' the audit trail. #' #' @family mock generation APIs -#' @seealso [generate_mock_data_native()], [mock_spec()] +#' @seealso [generate_mock_data_native()], [generate_mock_data_simstudy()], +#' [mock_spec()] #' #' @examples #' spec <- mock_categorical( diff --git a/R/mock_spec_recodeflow.R b/R/mock_spec_recodeflow.R index f713a15..bcf0f52 100644 --- a/R/mock_spec_recodeflow.R +++ b/R/mock_spec_recodeflow.R @@ -5,6 +5,7 @@ # normalized mock_spec representation. # ============================================================================== +#' @noRd .read_recodeflow_table <- function(x, label) { if (is.data.frame(x)) { return(x) @@ -14,6 +15,8 @@ if (!file.exists(x)) { stop(label, " file does not exist: ", x, call. = FALSE) } + # Pin CSV parsing so path inputs behave like data-frame inputs for the + # recodeflow conventions MockData understands. return(read.csv( x, stringsAsFactors = FALSE, @@ -25,10 +28,12 @@ stop(label, " must be a data frame or a single CSV path.", call. = FALSE) } +#' @noRd .is_blank <- function(x) { is.null(x) || length(x) == 0 || is.na(x[1]) || trimws(as.character(x[1])) == "" } +#' @noRd .row_value <- function(row, name, default = NA) { if (!name %in% names(row)) { return(default) @@ -42,6 +47,7 @@ value } +#' @noRd .row_character <- function(row, name, default = NA_character_) { value <- .row_value(row, name, default) if (.is_blank(value)) { @@ -50,6 +56,7 @@ as.character(value) } +#' @noRd .row_numeric <- function(row, name, default = NA_real_) { value <- .row_value(row, name, default) if (.is_blank(value)) { @@ -69,6 +76,7 @@ numeric_value } +#' @noRd .recodeflow_required_columns <- function(data, required, label) { missing <- setdiff(required, names(data)) if (length(missing) > 0) { @@ -76,6 +84,7 @@ } } +#' @noRd .filter_recodeflow_by_database <- function(data, databaseStart, allow_empty = TRUE) { if (is.null(databaseStart)) { return(data) @@ -87,9 +96,12 @@ ) } + # databaseStart fields are comma-separated tokens, not substrings; this keeps + # cycles such as cchs2017 and cchs2017_2018_p distinct. data[.database_start_matches(data$databaseStart, databaseStart, allow_empty = allow_empty), , drop = FALSE] } +#' @noRd .filter_recodeflow_details <- function(variable_details, variable, databaseStart) { if (is.null(variable_details)) { return(NULL) @@ -99,6 +111,7 @@ .filter_recodeflow_by_database(details, databaseStart, allow_empty = TRUE) } +#' @noRd .recodeflow_variable_kind <- function(var_row) { rtype <- tolower(.row_character(var_row, "rType", "")) variable_type <- tolower(.row_character(var_row, "variableType", "")) @@ -121,6 +134,7 @@ ) } +#' @noRd .recodeflow_rtype <- function(var_row, kind) { rtype <- tolower(.row_character(var_row, "rType", "")) if (rtype != "") { @@ -138,6 +152,7 @@ ) } +#' @noRd .parse_single_date <- function(value) { if (.is_blank(value)) { return(NULL) @@ -151,6 +166,7 @@ NULL } +#' @noRd .recodeflow_valid_rows <- function(details) { if (is.null(details) || nrow(details) == 0) { return(details) @@ -159,6 +175,8 @@ rec_start <- as.character(details$recStart) rec_end <- if ("recEnd" %in% names(details)) as.character(details$recEnd) else rep("", nrow(details)) + # Functional and derived rows live in recStart; recEnd carries missing-code + # semantics such as NA::a / NA::b. keep <- !is.na(rec_start) & rec_start != "" & rec_start != "else" & @@ -170,6 +188,7 @@ details[keep, , drop = FALSE] } +#' @noRd .recodeflow_range <- function(details, variable, kind) { valid_rows <- .recodeflow_valid_rows(details) if (is.null(valid_rows) || nrow(valid_rows) == 0) { @@ -199,6 +218,7 @@ stop("Variable '", variable, "' has no parseable ", kind, " range in recStart.", call. = FALSE) } +#' @noRd .recodeflow_missing <- function(details) { if (is.null(details) || nrow(details) == 0 || !"recEnd" %in% names(details)) { return(list(codes = character(0), proportions = numeric(0))) @@ -223,6 +243,7 @@ ) } +#' @noRd .recodeflow_distribution <- function(var_row, details) { distribution <- tolower(.row_character(var_row, "distribution", "")) if (distribution != "") { @@ -245,6 +266,7 @@ params$distribution %||% "uniform" } +#' @noRd .recodeflow_garbage_rules <- function(var_row) { rules <- list() @@ -269,6 +291,7 @@ rules } +#' @noRd .recodeflow_provenance <- function(variable, databaseStart = NULL) { provenance <- list(adapter = "recodeflow", source = variable) if (!is.null(databaseStart)) { @@ -277,6 +300,7 @@ provenance } +#' @noRd .recodeflow_to_spec_variable <- function(var_row, details, databaseStart) { variable <- .row_character(var_row, "variable") kind <- .recodeflow_variable_kind(var_row) @@ -364,6 +388,12 @@ #' generated. Set `exclude_derived = FALSE` only when you want those rows to #' appear in the adapter input and fail or be handled by later formula support. #' +#' CSV path inputs are read with `stringsAsFactors = FALSE`, +#' `check.names = FALSE`, and `na.strings = c("", "NA")` so path-based inputs +#' preserve recodeflow column names and treat blank metadata cells like missing +#' values. The adapter normalizes `rType = "numeric"` to `"double"` to match +#' the v0.4 `mock_spec` type vocabulary. +#' #' @param variables Data frame or CSV path for recodeflow-style `variables` #' metadata. #' @param variable_details Data frame, CSV path, or `NULL` for recodeflow-style @@ -380,7 +410,7 @@ #' @return A validated `mock_spec` object. #' @family mock specification APIs #' @seealso [mock_spec()], [mock_continuous()], [mock_categorical()], -#' [mock_date()] +#' [mock_date()], [generate_mock_data_native()], [postprocess_mock_data()] #' #' @examples #' variables <- data.frame( @@ -399,6 +429,12 @@ #' spec <- mock_spec_from_recodeflow(variables, details) #' validate_mock_spec(spec) #' +#' variables_file <- tempfile(fileext = ".csv") +#' details_file <- tempfile(fileext = ".csv") +#' write.csv(variables, variables_file, row.names = FALSE) +#' write.csv(details, details_file, row.names = FALSE) +#' spec_from_files <- mock_spec_from_recodeflow(variables_file, details_file) +#' #' @export mock_spec_from_recodeflow <- function(variables, variable_details = NULL, diff --git a/R/mock_spec_simstudy.R b/R/mock_spec_simstudy.R index 2535623..a6b08fc 100644 --- a/R/mock_spec_simstudy.R +++ b/R/mock_spec_simstudy.R @@ -5,6 +5,7 @@ # installed. MockData still owns post-processing and diagnostics. # ============================================================================== +#' @noRd .require_simstudy <- function() { if (!requireNamespace("simstudy", quietly = TRUE)) { stop( @@ -24,6 +25,7 @@ invisible(TRUE) } +#' @noRd .check_simstudy_variable <- function(variable) { if (identical(variable$name, "id")) { stop( @@ -34,6 +36,8 @@ } if (variable$type == "categorical" && any(grepl(";", as.character(variable$levels), fixed = TRUE))) { + # simstudy encodes categorical probabilities and labels as semicolon- + # delimited strings; labels containing ';' cannot round-trip safely. stop( "Variable '", variable$name, "' has categorical level(s) containing ';', which simstudy uses as a delimiter.", @@ -44,6 +48,7 @@ invisible(TRUE) } +#' @noRd .simstudy_definition <- function(def, variable) { .check_simstudy_variable(variable) @@ -65,6 +70,8 @@ probabilities <- rep(1 / length(variable$levels), length(variable$levels)) } + # defData() expects categorical probabilities in formula and labels in + # variance, both as semicolon-delimited vectors. return(simstudy::defData( dtDefs = def, varname = variable$name, @@ -81,6 +88,7 @@ ) } +#' @noRd .normalize_simstudy_categorical <- function(values, variable) { value_chr <- as.character(values) levels <- as.character(variable$levels) @@ -88,6 +96,8 @@ return(value_chr) } + # Older simstudy versions returned 1-based category indices in some paths; + # keep the adapter explicit so cross-version drift cannot masquerade as data. index <- suppressWarnings(as.integer(value_chr)) if (!any(is.na(index)) && all(index >= 1 & index <= length(levels))) { return(levels[index]) @@ -101,6 +111,7 @@ ) } +#' @noRd .simstudy_can_generate <- function(variable) { if (variable$type == "categorical") { return(TRUE) @@ -110,14 +121,17 @@ identical(tolower(variable$distribution %||% "uniform"), "uniform") } +#' @noRd .simstudy_variables <- function(spec) { Filter(.simstudy_can_generate, spec$variables) } +#' @noRd .native_only_variables <- function(spec) { Filter(function(variable) !.simstudy_can_generate(variable), spec$variables) } +#' @noRd .generate_simstudy_baseline <- function(variables, n) { if (length(variables) == 0) { return(.empty_native_data(n)) @@ -152,6 +166,7 @@ generated } +#' @noRd .generate_native_only_baseline <- function(variables, n) { if (length(variables) == 0) { return(.empty_native_data(n)) @@ -174,6 +189,19 @@ #' truncated normal ranges and calendar dates, are generated by MockData's native #' path inside the same seeded call. #' +#' @details +#' `simstudy` is an optional `Suggests` dependency and is GPL-3 licensed; +#' MockData remains MIT licensed by keeping this backend soft-gated and by +#' retaining [generate_mock_data_native()] as the default engine. Use this +#' backend when `simstudy` is installed and you want to exercise the optional +#' engine path. It currently delegates only categorical and uniform continuous +#' baseline generation to `simstudy`; unsupported variables are routed through +#' MockData's native backend so that a single specification can mix capabilities. +#' +#' Missing-code assignment, garbage values, and diagnostics are not delegated to +#' `simstudy`. They remain MockData-owned post-processing so both backends share +#' the same auditability contract. +#' #' @param spec A `mock_spec` object. #' @param n Non-negative whole number of rows to generate. #' @param seed Optional whole-number random seed. The previous R random state is @@ -202,6 +230,8 @@ generate_mock_data_simstudy <- function(spec, n, seed = NULL) { .with_mock_seed(seed, { simstudy_data <- .generate_simstudy_baseline(simstudy_variables, n) + # Normal/date variables stay native until MockData has an explicit contract + # for mapping their range and calendar semantics into simstudy definitions. native_data <- .generate_native_only_baseline(native_only_variables, n) columns <- c(simstudy_data, native_data) From 3d448df827331d9e201860e0ee6cf54665b08b43 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Wed, 20 May 2026 06:26:10 -0400 Subject: [PATCH 15/18] Clarify v0.4 postprocess and simstudy notes --- NEWS.md | 7 ++++++- R/mock_spec_postprocess.R | 11 +++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9bdd9b5..3fcdd78 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,13 +21,18 @@ distinguish assigned missing/garbage rows from naturally drawn values. - Post-processing diagnostics now protect naturally drawn missing-code collisions from later garbage assignment, apply garbage rules in canonical - `low` -> `high` -> other order, and stop on repeated post-processing. + `low` -> `high` -> other order, and stop on repeated post-processing. This + prevents silent diagnostic drift when a naturally drawn missing-code value + would otherwise be overwritten by garbage assignment. - Added `generate_mock_data_simstudy()` as a soft-gated optional backend for baseline categorical and uniform continuous generation when `simstudy` is installed, with native generation retained for MockData-specific semantics. - The optional `simstudy` backend is kept in `Suggests`, requires `simstudy >= 0.8.1`, and validates categorical labels before converting generated values back into MockData's `mock_spec` levels. +- The optional `simstudy` backend now rejects variables named `id`, which + conflicts with `simstudy`'s generated row identifier, and normalizes + categorical output through an explicit label-or-index validation path. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R index d2751d4..8b394f2 100644 --- a/R/mock_spec_postprocess.R +++ b/R/mock_spec_postprocess.R @@ -322,10 +322,13 @@ #' declared missing code. #' #' Garbage rules are applied in canonical order: `low`, then `high`, then any -#' other named rules in caller order. Diagnostics are stored as a data-frame -#' attribute. Base R subsetting and some downstream tools may drop attributes, -#' so preserve the original post-processed object when diagnostics are part of -#' the audit trail. +#' other named rules in caller order. Each garbage rule is a named list with a +#' `proportion` field and a `range` field using MockData range notation, for +#' example `list(high = list(proportion = 0.05, range = "[150, 200]"))`. +#' +#' Diagnostics are stored as a data-frame attribute. Base R subsetting and some +#' downstream tools may drop attributes, so preserve the original post-processed +#' object when diagnostics are part of the audit trail. #' #' @family mock generation APIs #' @seealso [generate_mock_data_native()], [generate_mock_data_simstudy()], From 9fe335e7cd3e9fd7ebe4732a984ef47619f324b7 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Wed, 20 May 2026 06:33:18 -0400 Subject: [PATCH 16/18] Route create_mock_data through mock_spec pipeline --- NEWS.md | 3 + R/create_mock_data.R | 109 ++++++++++++++++++++- tests/testthat/test-create-mock-data-v04.R | 105 ++++++++++++++++++++ 3 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/test-create-mock-data-v04.R diff --git a/NEWS.md b/NEWS.md index 3fcdd78..c92e2a5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,9 @@ - The optional `simstudy` backend now rejects variables named `id`, which conflicts with `simstudy`'s generated row identifier, and normalizes categorical output through an explicit label-or-index validation path. +- `create_mock_data()` now attempts the v0.4 `mock_spec` pipeline in strict + mode for supported recodeflow metadata, while retaining the legacy `create_*` + dispatch path for unsupported v0.4 backend features and lenient generation. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/create_mock_data.R b/R/create_mock_data.R index 2f5494c..158dc41 100644 --- a/R/create_mock_data.R +++ b/R/create_mock_data.R @@ -1,3 +1,83 @@ +#' @noRd +.create_mock_data_v04_database_filter <- function(variables, databaseStart) { + if (!is.null(databaseStart) && "databaseStart" %in% names(variables)) { + return(databaseStart) + } + + NULL +} + +#' @noRd +.create_mock_data_v04_native_supported <- function(spec) { + all(vapply(spec$variables, function(variable) { + formula <- variable$formula + has_formula <- !is.null(formula) && + !(is.character(formula) && length(formula) == 1 && (is.na(formula) || trimws(formula) == "")) + if (has_formula) { + return(FALSE) + } + + distribution <- tolower(variable$distribution %||% "uniform") + if (variable$type == "continuous") { + return(distribution %in% c("uniform", "normal")) + } + if (variable$type == "categorical") { + return(TRUE) + } + if (variable$type == "date") { + return(distribution == "uniform" && identical(variable$source_format %||% "analysis", "analysis")) + } + + FALSE + }, logical(1))) +} + +#' @noRd +.create_mock_data_v04 <- function(databaseStart, + variables, + variable_details, + n, + seed, + verbose = FALSE) { + if (!is.null(databaseStart) && + !"databaseStart" %in% names(variables) && + "databaseStart" %in% names(variable_details)) { + if (isTRUE(verbose)) { + message( + "v0.4 mock_spec pipeline requires variable-level databaseStart when ", + "detail-level databaseStart filtering is needed; using legacy ", + "create_* dispatch." + ) + } + return(NULL) + } + + spec <- mock_spec_from_recodeflow( + variables = variables, + variable_details = variable_details, + databaseStart = .create_mock_data_v04_database_filter(variables, databaseStart), + role = "enabled" + ) + + if (!.create_mock_data_v04_native_supported(spec)) { + if (isTRUE(verbose)) { + message( + "v0.4 mock_spec pipeline does not yet support every requested ", + "variable; using legacy create_* dispatch." + ) + } + return(NULL) + } + + if (isTRUE(verbose)) { + message("Generating via v0.4 mock_spec pipeline.") + } + + baseline <- generate_mock_data_native(spec, n = n, seed = seed) + postprocess_seed <- if (is.null(seed)) NULL else seed + 1L + postprocess_mock_data(baseline, spec, seed = postprocess_seed) +} + #' Create mock data from configuration files #' #' @description @@ -38,8 +118,16 @@ #' @return Data frame with n rows and one column per enabled variable. #' #' @details -#' **v0.3.0 API**: This function now follows the "recodeflow pattern" where it passes -#' full metadata data frames to create_* functions, which handle internal filtering. +#' **v0.4.0 transition**: In strict mode, this function first attempts to use +#' the v0.4 `mock_spec` pipeline: [mock_spec_from_recodeflow()], +#' [generate_mock_data_native()], and [postprocess_mock_data()]. If the metadata +#' requests a feature not yet supported by the v0.4 native backend, it falls +#' back to the v0.3 `create_*` dispatch path so existing users can migrate +#' gradually. +#' +#' **v0.3.0 API**: This function follows the "recodeflow pattern" where it passes +#' full metadata data frames to create_* functions, which handle internal +#' filtering. #' #' **Generation process**: #' \enumerate{ @@ -155,6 +243,23 @@ create_mock_data <- function(databaseStart, stop("variables must have a 'variableType' column") } + # ========== v0.4 PIPELINE PATH ========== + + if (isTRUE(validate) && !is.null(variable_details)) { + v04_result <- .create_mock_data_v04( + databaseStart = databaseStart, + variables = variables, + variable_details = variable_details, + n = n, + seed = seed, + verbose = verbose + ) + + if (!is.null(v04_result)) { + return(v04_result) + } + } + # ========== FILTER FOR ENABLED VARIABLES ========== if (verbose) message("Filtering for enabled variables...") diff --git a/tests/testthat/test-create-mock-data-v04.R b/tests/testthat/test-create-mock-data-v04.R new file mode 100644 index 0000000..916a82b --- /dev/null +++ b/tests/testthat/test-create-mock-data-v04.R @@ -0,0 +1,105 @@ +test_that("create_mock_data uses the v0.4 pipeline for strict supported metadata", { + variables <- data.frame( + variable = "smoking", + variableType = "Categorical", + rType = "character", + role = "enabled", + garbage_low_prop = 0.1, + garbage_low_range = "[-2, 0]", + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = "smoking", + recStart = c("1", "2", "97"), + recEnd = c("copy", "copy", "NA::b"), + proportion = c(0.6, 0.3, 0.1), + stringsAsFactors = FALSE + ) + + result <- create_mock_data( + databaseStart = "study", + variables = variables, + variable_details = variable_details, + n = 100, + seed = 101 + ) + diagnostics <- attr(result, "mockdata_diagnostics")$variables$smoking + + expect_equal(names(result), "smoking") + expect_true(all(result$smoking %in% c("1", "2", "97", "-2", "-1", "0"))) + expect_equal(length(diagnostics$assigned_missing_indices), 10) + expect_equal(length(diagnostics$assigned_garbage_indices$low), 9) + expect_length(intersect( + diagnostics$assigned_missing_indices, + diagnostics$assigned_garbage_indices$low + ), 0) +}) + +test_that("create_mock_data keeps legacy fallback for unsupported v0.4 backend features", { + variables <- data.frame( + variable = "time_to_visit", + variableType = "Continuous", + rType = "double", + role = "enabled", + distribution = "exponential", + rate = 0.5, + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = "time_to_visit", + recStart = "[0, 10]", + recEnd = "copy", + proportion = 1, + stringsAsFactors = FALSE + ) + + expect_message( + result <- create_mock_data( + databaseStart = "study", + variables = variables, + variable_details = variable_details, + n = 50, + seed = 202, + verbose = TRUE + ), + "legacy create_\\* dispatch" + ) + + expect_equal(names(result), "time_to_visit") + expect_equal(nrow(result), 50) + expect_true(is.numeric(result$time_to_visit)) + expect_null(attr(result, "mockdata_diagnostics")) +}) + +test_that("create_mock_data keeps legacy detail-level databaseStart filtering", { + variables <- data.frame( + variable = "smoking", + variableType = "Categorical", + rType = "character", + role = "enabled", + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = c("smoking", "smoking"), + recStart = c("1", "2"), + recEnd = c("copy", "copy"), + proportion = c(1, 1), + databaseStart = c("cycle1", "cycle10"), + stringsAsFactors = FALSE + ) + + expect_message( + result <- create_mock_data( + databaseStart = "cycle1", + variables = variables, + variable_details = variable_details, + n = 10, + seed = 303, + verbose = TRUE + ), + "detail-level databaseStart filtering" + ) + + expect_equal(unique(result$smoking), "1") + expect_null(attr(result, "mockdata_diagnostics")) +}) From ca00377c84d5035e176743a830f9cfb3a1a8a63e Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Wed, 20 May 2026 07:16:49 -0400 Subject: [PATCH 17/18] Document and test create_mock_data v04 routing --- NEWS.md | 4 + R/create_mock_data.R | 60 ++++++++--- tests/testthat/test-create-mock-data-v04.R | 111 ++++++++++++++++++++- 3 files changed, 160 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index c92e2a5..434a5e8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,10 @@ - `create_mock_data()` now attempts the v0.4 `mock_spec` pipeline in strict mode for supported recodeflow metadata, while retaining the legacy `create_*` dispatch path for unsupported v0.4 backend features and lenient generation. + The v0.4 path attaches `mockdata_diagnostics` and uses `seed` for baseline + generation plus `seed + 1` for post-processing, so exact seeded output may + differ from v0.3.x even when the public seed is unchanged. Verbose mode now + reports whether the v0.4 or legacy path was chosen. - Added forward-compatible specification fields: `spec_version`, `provenance`, and `model_hint`. - Existing v0.3 generator APIs remain available while v0.4 internals are built. diff --git a/R/create_mock_data.R b/R/create_mock_data.R index 158dc41..2ba23ec 100644 --- a/R/create_mock_data.R +++ b/R/create_mock_data.R @@ -8,28 +8,35 @@ } #' @noRd -.create_mock_data_v04_native_supported <- function(spec) { - all(vapply(spec$variables, function(variable) { +.create_mock_data_v04_unsupported_variables <- function(spec) { + unsupported <- vapply(spec$variables, function(variable) { formula <- variable$formula has_formula <- !is.null(formula) && !(is.character(formula) && length(formula) == 1 && (is.na(formula) || trimws(formula) == "")) if (has_formula) { - return(FALSE) + return(TRUE) } distribution <- tolower(variable$distribution %||% "uniform") if (variable$type == "continuous") { - return(distribution %in% c("uniform", "normal")) + return(!distribution %in% c("uniform", "normal")) } if (variable$type == "categorical") { - return(TRUE) + return(FALSE) } if (variable$type == "date") { - return(distribution == "uniform" && identical(variable$source_format %||% "analysis", "analysis")) + return(!(distribution == "uniform" && identical(variable$source_format %||% "analysis", "analysis"))) } - FALSE - }, logical(1))) + TRUE + }, logical(1)) + + names(spec$variables)[unsupported] +} + +#' @noRd +.create_mock_data_v04_native_supported <- function(spec) { + length(.create_mock_data_v04_unsupported_variables(spec)) == 0 } #' @noRd @@ -59,11 +66,13 @@ role = "enabled" ) - if (!.create_mock_data_v04_native_supported(spec)) { + unsupported <- .create_mock_data_v04_unsupported_variables(spec) + if (length(unsupported) > 0) { if (isTRUE(verbose)) { message( "v0.4 mock_spec pipeline does not yet support every requested ", - "variable; using legacy create_* dispatch." + "variable; using legacy create_* dispatch. Unsupported variable(s): ", + paste(unsupported, collapse = ", ") ) } return(NULL) @@ -74,6 +83,9 @@ } baseline <- generate_mock_data_native(spec, n = n, seed = seed) + # The wrapper uses a second deterministic stream for post-processing so + # baseline generation and missing/garbage assignment can be reproduced + # independently from the single public seed. postprocess_seed <- if (is.null(seed)) NULL else seed + 1L postprocess_mock_data(baseline, spec, seed = postprocess_seed) } @@ -115,7 +127,10 @@ #' affected variable is skipped. #' @param verbose Logical. Whether to print progress messages (default FALSE). #' -#' @return Data frame with n rows and one column per enabled variable. +#' @return Data frame with n rows and one column per enabled variable. When the +#' v0.4 `mock_spec` path is used, the result also carries a +#' `mockdata_diagnostics` attribute from [postprocess_mock_data()]. Legacy +#' fallback paths return plain data frames without that attribute. #' #' @details #' **v0.4.0 transition**: In strict mode, this function first attempts to use @@ -125,6 +140,16 @@ #' back to the v0.3 `create_*` dispatch path so existing users can migrate #' gradually. #' +#' The wrapper deliberately stays on the legacy path when `validate = FALSE`, +#' when `variable_details = NULL`, when detail-level `databaseStart` filtering is +#' needed but the variables metadata has no `databaseStart` column, or when a +#' variable uses a feature not yet supported by the v0.4 native backend. Set +#' `verbose = TRUE` to see which path was chosen. +#' +#' In the v0.4 path, `seed` is used for baseline generation and `seed + 1` is +#' used for post-processing. This makes both stages deterministic, but generated +#' values may differ from v0.3.x output for the same seed. +#' #' **v0.3.0 API**: This function follows the "recodeflow pattern" where it passes #' full metadata data frames to create_* functions, which handle internal #' filtering. @@ -194,6 +219,9 @@ #' } #' #' @family generators +#' @family mock generation APIs +#' @seealso [mock_spec_from_recodeflow()], [generate_mock_data_native()], +#' [postprocess_mock_data()], [generate_mock_data_simstudy()], [mock_spec()] #' @export create_mock_data <- function(databaseStart, variables, @@ -245,7 +273,15 @@ create_mock_data <- function(databaseStart, # ========== v0.4 PIPELINE PATH ========== - if (isTRUE(validate) && !is.null(variable_details)) { + if (!isTRUE(validate)) { + if (verbose) { + message("validate = FALSE requested; using legacy create_* dispatch.") + } + } else if (is.null(variable_details)) { + if (verbose) { + message("variable_details = NULL; using legacy create_* fallback dispatch.") + } + } else { v04_result <- .create_mock_data_v04( databaseStart = databaseStart, variables = variables, diff --git a/tests/testthat/test-create-mock-data-v04.R b/tests/testthat/test-create-mock-data-v04.R index 916a82b..781ffde 100644 --- a/tests/testthat/test-create-mock-data-v04.R +++ b/tests/testthat/test-create-mock-data-v04.R @@ -27,8 +27,10 @@ test_that("create_mock_data uses the v0.4 pipeline for strict supported metadata expect_equal(names(result), "smoking") expect_true(all(result$smoking %in% c("1", "2", "97", "-2", "-1", "0"))) - expect_equal(length(diagnostics$assigned_missing_indices), 10) - expect_equal(length(diagnostics$assigned_garbage_indices$low), 9) + expect_true(length(diagnostics$assigned_missing_indices) >= 8) + expect_true(length(diagnostics$assigned_missing_indices) <= 12) + expect_true(length(diagnostics$assigned_garbage_indices$low) >= 7) + expect_true(length(diagnostics$assigned_garbage_indices$low) <= 11) expect_length(intersect( diagnostics$assigned_missing_indices, diagnostics$assigned_garbage_indices$low @@ -62,7 +64,7 @@ test_that("create_mock_data keeps legacy fallback for unsupported v0.4 backend f seed = 202, verbose = TRUE ), - "legacy create_\\* dispatch" + "Unsupported variable\\(s\\): time_to_visit" ) expect_equal(names(result), "time_to_visit") @@ -103,3 +105,106 @@ test_that("create_mock_data keeps legacy detail-level databaseStart filtering", expect_equal(unique(result$smoking), "1") expect_null(attr(result, "mockdata_diagnostics")) }) + +test_that("create_mock_data announces validate FALSE legacy path", { + variables <- data.frame( + variable = "smoking", + variableType = "Categorical", + rType = "character", + role = "enabled", + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = "smoking", + recStart = c("1", "2"), + recEnd = c("copy", "copy"), + proportion = c(0.6, 0.4), + stringsAsFactors = FALSE + ) + + expect_message( + result <- create_mock_data( + databaseStart = "study", + variables = variables, + variable_details = variable_details, + n = 20, + seed = 404, + validate = FALSE, + verbose = TRUE + ), + "validate = FALSE requested" + ) + + expect_equal(names(result), "smoking") + expect_null(attr(result, "mockdata_diagnostics")) +}) + +test_that("create_mock_data announces variable_details NULL legacy fallback path", { + variables <- data.frame( + variable = "age", + variableType = "Continuous", + rType = "integer", + role = "enabled", + stringsAsFactors = FALSE + ) + + expect_warning( + expect_message( + result <- create_mock_data( + databaseStart = "study", + variables = variables, + variable_details = NULL, + n = 20, + seed = 505, + verbose = TRUE + ), + "variable_details = NULL" + ), + "No variable_details rows found" + ) + + expect_equal(names(result), "age") + expect_null(attr(result, "mockdata_diagnostics")) +}) + +test_that("create_mock_data v0.4 and legacy paths are distributionally aligned", { + variables <- data.frame( + variable = "smoking", + variableType = "Categorical", + rType = "character", + role = "enabled", + stringsAsFactors = FALSE + ) + variable_details <- data.frame( + variable = "smoking", + recStart = c("1", "2"), + recEnd = c("copy", "copy"), + proportion = c(0.65, 0.35), + stringsAsFactors = FALSE + ) + + v04 <- create_mock_data( + databaseStart = "study", + variables = variables, + variable_details = variable_details, + n = 5000, + seed = 606 + ) + legacy <- create_mock_data( + databaseStart = "study", + variables = variables, + variable_details = variable_details, + n = 5000, + seed = 606, + validate = FALSE + ) + + expect_equal(sort(unique(v04$smoking)), sort(unique(legacy$smoking))) + expect_equal( + unname(prop.table(table(v04$smoking))[c("1", "2")]), + unname(prop.table(table(legacy$smoking))[c("1", "2")]), + tolerance = 0.03 + ) + expect_type(attr(v04, "mockdata_diagnostics"), "list") + expect_null(attr(legacy, "mockdata_diagnostics")) +}) From 54047316b57c765fa8f79eafc9120041b1a948bf Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Wed, 20 May 2026 08:57:07 -0400 Subject: [PATCH 18/18] Add v04 functions to pkgdown reference index --- _pkgdown.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 5347b43..d17d53a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -13,6 +13,32 @@ footer: developed_by:

Developed by Juan Li, Douglas Manuel, and recodeflow contributors.

reference: +- title: v0.4 specification API + desc: > + Build and validate normalized `mock_spec` objects using direct helpers or + composable variable constructors. + contents: + - mock_spec + - mock_continuous + - mock_categorical + - mock_date + - mock_spec_continuous + - mock_spec_categorical + - mock_spec_date + - is_mock_spec + - validate_mock_spec + - mock_spec_model_hints + +- title: v0.4 adapters and generation pipeline + desc: > + Convert recodeflow metadata into `mock_spec` objects, generate baseline data, + and apply post-processing diagnostics. + contents: + - mock_spec_from_recodeflow + - generate_mock_data_native + - generate_mock_data_simstudy + - postprocess_mock_data + - title: Main generation functions desc: > Generate categorical, continuous, date, and survival variables. Use `create_mock_data()`