From 9d23b2759dc4000ae3d0f33a5c7a4c2585d8170b Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 06:52:45 -0400
Subject: [PATCH 01/18] Add v0.4 mock_spec foundation

---
 NAMESPACE                                  |    6 +
 R/mock_spec.R                              |  477 +++++++++
 development/adr/v04-hybrid-backend.md      |  115 +++
 development/simstudy-v04.md                |  128 +++
 development/v04-simstudy-spike/README.md   |  155 ---
 development/v04-simstudy-spike/prototype.R | 1071 --------------------
 tests/testthat/test-mock-spec.R            |  143 +++
 7 files changed, 869 insertions(+), 1226 deletions(-)
 create mode 100644 R/mock_spec.R
 create mode 100644 development/adr/v04-hybrid-backend.md
 create mode 100644 development/simstudy-v04.md
 delete mode 100644 development/v04-simstudy-spike/README.md
 delete mode 100644 development/v04-simstudy-spike/prototype.R
 create mode 100644 tests/testthat/test-mock-spec.R

diff --git a/NAMESPACE b/NAMESPACE
index c3b3e55..33c9e9a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -23,7 +23,12 @@ export(get_variables_by_role)
 export(has_garbage)
 export(identify_derived_vars)
 export(import_from_recodeflow)
+export(is_mock_spec)
 export(make_garbage)
+export(mock_spec)
+export(mock_spec_categorical)
+export(mock_spec_continuous)
+export(mock_spec_date)
 export(parse_range_notation)
 export(parse_variable_start)
 export(read_mock_data_config)
@@ -31,6 +36,7 @@ export(read_mock_data_config_details)
 export(sample_with_proportions)
 export(validate_mock_data_config)
 export(validate_mock_data_config_details)
+export(validate_mock_spec)
 export(validate_mockdata_metadata)
 importFrom(stats,rexp)
 importFrom(stats,rnorm)
diff --git a/R/mock_spec.R b/R/mock_spec.R
new file mode 100644
index 0000000..69b8a0a
--- /dev/null
+++ b/R/mock_spec.R
@@ -0,0 +1,477 @@
+# ==============================================================================
+# MockData v0.4 Specification Layer
+# ==============================================================================
+# Normalized internal representation for direct APIs, recodeflow adapters, and
+# optional generation backends.
+# ==============================================================================
+
+.mock_spec_version <- "0.4.0"
+
+.mock_spec_model_hints <- c(
+  "auto",
+  "native",
+  "simstudy",
+  "native-postprocess",
+  "simstudy-or-native",
+  "simstudy-advanced",
+  "diagnostic-required"
+)
+
+`%||%` <- function(x, y) {
+  if (is.null(x)) y else x
+}
+
+.normalize_provenance <- function(provenance, source = NULL) {
+  if (is.null(provenance)) {
+    provenance <- list(adapter = "direct", source = source %||% "direct")
+  } else if (!is.list(provenance)) {
+    provenance <- list(adapter = as.character(provenance), source = source %||% as.character(provenance))
+  }
+
+  if (is.null(provenance$adapter) || is.na(provenance$adapter) || provenance$adapter == "") {
+    provenance$adapter <- "unknown"
+  }
+  if (is.null(provenance$source) || is.na(provenance$source) || provenance$source == "") {
+    provenance$source <- provenance$adapter
+  }
+
+  provenance
+}
+
+.validate_model_hint <- function(model_hint) {
+  if (length(model_hint) != 1 || is.na(model_hint) || !model_hint %in% .mock_spec_model_hints) {
+    stop(
+      "model_hint must be one of: ",
+      paste(.mock_spec_model_hints, collapse = ", "),
+      call. = FALSE
+    )
+  }
+
+  invisible(TRUE)
+}
+
+.new_mock_spec_variable <- function(name,
+                                    type,
+                                    rtype,
+                                    distribution = NULL,
+                                    range = NULL,
+                                    levels = NULL,
+                                    proportions = NULL,
+                                    formula = NULL,
+                                    missing_codes = character(0),
+                                    missing_proportions = numeric(0),
+                                    garbage_rules = list(),
+                                    source_format = NULL,
+                                    depends_on = character(0),
+                                    provenance = NULL,
+                                    model_hint = "auto",
+                                    ...) {
+  if (!is.character(name) || length(name) != 1 || is.na(name) || trimws(name) == "") {
+    stop("mock_spec variable name must be a non-empty string.", call. = FALSE)
+  }
+  if (!is.character(type) || length(type) != 1 || is.na(type) || trimws(type) == "") {
+    stop("mock_spec variable type must be a non-empty string.", call. = FALSE)
+  }
+
+  .validate_model_hint(model_hint)
+
+  structure(
+    c(
+      list(
+        name = name,
+        type = tolower(type),
+        rtype = tolower(rtype),
+        distribution = distribution,
+        range = range,
+        levels = levels,
+        proportions = proportions,
+        formula = formula,
+        missing_codes = missing_codes,
+        missing_proportions = missing_proportions,
+        garbage_rules = garbage_rules,
+        source_format = source_format,
+        depends_on = depends_on,
+        provenance = .normalize_provenance(provenance),
+        model_hint = model_hint
+      ),
+      list(...)
+    ),
+    class = c("mock_spec_variable", "list")
+  )
+}
+
+.as_mock_spec_variable_list <- function(...) {
+  variables <- list(...)
+
+  if (length(variables) == 1 && is.null(variables[[1]])) {
+    return(list())
+  }
+
+  if (length(variables) == 1 && is.list(variables[[1]]) && !inherits(variables[[1]], "mock_spec_variable")) {
+    variables <- variables[[1]]
+  }
+
+  if (length(variables) == 0) {
+    return(list())
+  }
+
+  if (!all(vapply(variables, inherits, logical(1), what = "mock_spec_variable"))) {
+    stop("mock_spec() inputs must be mock_spec_variable objects.", call. = FALSE)
+  }
+
+  names(variables) <- vapply(variables, `[[`, character(1), "name")
+  variables
+}
+
+#' Create a MockData specification
+#'
+#' `mock_spec()` creates the normalized v0.4 specification object used by the
+#' new architecture. Direct APIs and recodeflow adapters should both normalize
+#' into this shape before validation and generation.
+#'
+#' @param ... `mock_spec_variable` objects, or a single list of them. `NULL`
+#'   creates an empty specification.
+#' @param spec_version Character version of the specification shape.
+#' @param provenance List or character describing where the spec came from.
+#' @param model_hint Character backend hint. One of the supported MockData model
+#'   hints.
+#'
+#' @return S3 object of class `mock_spec`.
+#' @export
+mock_spec <- function(...,
+                      spec_version = .mock_spec_version,
+                      provenance = list(adapter = "direct", source = "direct"),
+                      model_hint = "auto") {
+  .validate_model_hint(model_hint)
+
+  structure(
+    list(
+      spec_version = spec_version,
+      provenance = .normalize_provenance(provenance),
+      model_hint = model_hint,
+      variables = .as_mock_spec_variable_list(...)
+    ),
+    class = c("mock_spec", "list")
+  )
+}
+
+#' Create a continuous variable specification
+#'
+#' @param name Variable name.
+#' @param range Numeric vector of length two giving the inclusive valid range.
+#' @param distribution Distribution name. Defaults to `"uniform"`.
+#' @param mean,sd Optional distribution parameters.
+#' @param rtype R output type. Defaults to `"double"`.
+#' @param missing_codes Explicit missing-code values.
+#' @param missing_proportions Missing-code probabilities aligned to
+#'   `missing_codes`.
+#' @param garbage_rules List of intentional invalid-value rules.
+#' @param provenance Provenance metadata.
+#' @param model_hint Backend hint.
+#'
+#' @return A `mock_spec_variable` object.
+#' @export
+mock_spec_continuous <- function(name,
+                                 range,
+                                 distribution = "uniform",
+                                 mean = NA_real_,
+                                 sd = NA_real_,
+                                 rtype = "double",
+                                 missing_codes = numeric(0),
+                                 missing_proportions = numeric(0),
+                                 garbage_rules = list(),
+                                 provenance = "direct",
+                                 model_hint = "auto") {
+  .new_mock_spec_variable(
+    name = name,
+    type = "continuous",
+    rtype = rtype,
+    distribution = distribution,
+    range = range,
+    mean = mean,
+    sd = sd,
+    missing_codes = missing_codes,
+    missing_proportions = missing_proportions,
+    garbage_rules = garbage_rules,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
+
+#' Create a categorical variable specification
+#'
+#' @param name Variable name.
+#' @param levels Character vector of valid levels or codes.
+#' @param proportions Optional probabilities aligned to `levels`.
+#' @param rtype R output type. Defaults to `"factor"`.
+#' @param missing_codes Explicit missing-code values.
+#' @param missing_proportions Missing-code probabilities aligned to
+#'   `missing_codes`.
+#' @param garbage_rules List of intentional invalid-value rules.
+#' @param provenance Provenance metadata.
+#' @param model_hint Backend hint.
+#'
+#' @return A `mock_spec_variable` object.
+#' @export
+mock_spec_categorical <- function(name,
+                                  levels,
+                                  proportions = NULL,
+                                  rtype = "factor",
+                                  missing_codes = character(0),
+                                  missing_proportions = numeric(0),
+                                  garbage_rules = list(),
+                                  provenance = "direct",
+                                  model_hint = "auto") {
+  .new_mock_spec_variable(
+    name = name,
+    type = "categorical",
+    rtype = rtype,
+    distribution = "categorical",
+    levels = levels,
+    proportions = proportions,
+    missing_codes = missing_codes,
+    missing_proportions = missing_proportions,
+    garbage_rules = garbage_rules,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
+
+#' Create a date variable specification
+#'
+#' @param name Variable name.
+#' @param range Date vector of length two giving the inclusive valid date range.
+#' @param rtype R output type. Defaults to `"date"`.
+#' @param source_format Source-format hint. Defaults to `"analysis"`.
+#' @param missing_codes Explicit missing-code values.
+#' @param missing_proportions Missing-code probabilities aligned to
+#'   `missing_codes`.
+#' @param garbage_rules List of intentional invalid-value rules.
+#' @param provenance Provenance metadata.
+#' @param model_hint Backend hint.
+#'
+#' @return A `mock_spec_variable` object.
+#' @export
+mock_spec_date <- function(name,
+                           range,
+                           rtype = "date",
+                           source_format = "analysis",
+                           missing_codes = character(0),
+                           missing_proportions = numeric(0),
+                           garbage_rules = list(),
+                           provenance = "direct",
+                           model_hint = "native-postprocess") {
+  .new_mock_spec_variable(
+    name = name,
+    type = "date",
+    rtype = rtype,
+    distribution = "uniform",
+    range = range,
+    source_format = source_format,
+    missing_codes = missing_codes,
+    missing_proportions = missing_proportions,
+    garbage_rules = garbage_rules,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
+
+#' Check whether an object is a MockData specification
+#'
+#' @param x Object to check.
+#'
+#' @return Logical scalar.
+#' @export
+is_mock_spec <- function(x) {
+  inherits(x, "mock_spec")
+}
+
+.new_mock_spec_validation_result <- function(valid = TRUE,
+                                             errors = character(0),
+                                             warnings = character(0),
+                                             info = character(0)) {
+  structure(
+    list(
+      valid = valid,
+      errors = errors,
+      warnings = warnings,
+      info = info
+    ),
+    class = c("mock_spec_validation_result", "list")
+  )
+}
+
+.validate_probability_vector <- function(values, label, allow_null = FALSE) {
+  errors <- character(0)
+
+  if (is.null(values)) {
+    if (allow_null) {
+      return(errors)
+    }
+    return(paste0(label, " must not be NULL."))
+  }
+
+  if (!is.numeric(values)) {
+    errors <- c(errors, paste0(label, " must be numeric."))
+  } else {
+    if (any(is.na(values))) {
+      errors <- c(errors, paste0(label, " must not contain NA values."))
+    }
+    if (any(values < 0 | values > 1, na.rm = TRUE)) {
+      errors <- c(errors, paste0(label, " must be between 0 and 1."))
+    }
+  }
+
+  errors
+}
+
+.validate_missing_spec <- function(variable) {
+  errors <- character(0)
+
+  if (length(variable$missing_codes) == 0 && length(variable$missing_proportions) == 0) {
+    return(errors)
+  }
+
+  if (length(variable$missing_codes) != length(variable$missing_proportions)) {
+    errors <- c(errors, paste0(
+      "Variable '", variable$name,
+      "' must have one missing proportion per missing code."
+    ))
+  }
+
+  errors <- c(errors, .validate_probability_vector(
+    variable$missing_proportions,
+    paste0("Variable '", variable$name, "' missing_proportions"),
+    allow_null = FALSE
+  ))
+
+  missing_sum <- sum(variable$missing_proportions, na.rm = TRUE)
+  if (missing_sum > 1) {
+    errors <- c(errors, paste0(
+      "Variable '", variable$name,
+      "' missing proportions must sum to <= 1."
+    ))
+  }
+
+  errors
+}
+
+.validate_range <- function(range, variable_name, expected_class = "numeric") {
+  errors <- character(0)
+
+  if (is.null(range) || length(range) != 2) {
+    return(paste0("Variable '", variable_name, "' range must have length 2."))
+  }
+
+  if (expected_class == "Date") {
+    if (!inherits(range, "Date")) {
+      errors <- c(errors, paste0("Variable '", variable_name, "' range must be Date."))
+    }
+  } else if (!is.numeric(range)) {
+    errors <- c(errors, paste0("Variable '", variable_name, "' range must be numeric."))
+  }
+
+  if (any(is.na(range))) {
+    errors <- c(errors, paste0("Variable '", variable_name, "' range must not contain NA values."))
+  } else if (range[[1]] > range[[2]]) {
+    errors <- c(errors, paste0("Variable '", variable_name, "' range lower bound must be <= upper bound."))
+  }
+
+  errors
+}
+
+.validate_mock_spec_variable <- function(variable) {
+  errors <- character(0)
+
+  if (!inherits(variable, "mock_spec_variable")) {
+    return("All mock_spec variables must inherit from mock_spec_variable.")
+  }
+
+  errors <- c(errors, .validate_missing_spec(variable))
+
+  if (variable$type == "continuous") {
+    errors <- c(errors, .validate_range(variable$range, variable$name, "numeric"))
+    if (identical(variable$distribution, "normal")) {
+      if (is.null(variable$mean) || length(variable$mean) != 1 || is.na(variable$mean)) {
+        errors <- c(errors, paste0("Variable '", variable$name, "' normal distribution requires mean."))
+      }
+      if (is.null(variable$sd) || length(variable$sd) != 1 || is.na(variable$sd) || variable$sd <= 0) {
+        errors <- c(errors, paste0("Variable '", variable$name, "' normal distribution requires sd > 0."))
+      }
+    }
+  } else if (variable$type == "categorical") {
+    if (is.null(variable$levels) || length(variable$levels) == 0) {
+      errors <- c(errors, paste0("Variable '", variable$name, "' must have at least one level."))
+    }
+    if (!is.null(variable$proportions)) {
+      if (length(variable$levels) != length(variable$proportions)) {
+        errors <- c(errors, paste0("Variable '", variable$name, "' must have one proportion per level."))
+      }
+      errors <- c(errors, .validate_probability_vector(
+        variable$proportions,
+        paste0("Variable '", variable$name, "' proportions"),
+        allow_null = FALSE
+      ))
+      prop_sum <- sum(variable$proportions, na.rm = TRUE)
+      if (abs(prop_sum - 1) > 0.001) {
+        errors <- c(errors, paste0("Variable '", variable$name, "' proportions must sum to 1."))
+      }
+    }
+  } else if (variable$type == "date") {
+    errors <- c(errors, .validate_range(variable$range, variable$name, "Date"))
+  } else {
+    errors <- c(errors, paste0("Variable '", variable$name, "' has unsupported type '", variable$type, "'."))
+  }
+
+  errors
+}
+
+#' Validate a MockData specification
+#'
+#' @param spec A `mock_spec` object.
+#' @param n Optional number of rows expected for generation. If supplied, must
+#'   be a non-negative whole number.
+#' @param strict Logical. If `TRUE`, invalid specs throw an error. If `FALSE`,
+#'   a validation result object is returned.
+#'
+#' @return A `mock_spec_validation_result` object when valid or `strict = FALSE`.
+#' @export
+validate_mock_spec <- function(spec, n = NULL, strict = TRUE) {
+  errors <- character(0)
+  warnings <- character(0)
+  info <- character(0)
+
+  if (!is_mock_spec(spec)) {
+    errors <- c(errors, "spec must be a mock_spec object.")
+  } else {
+    if (is.null(spec$spec_version) || length(spec$spec_version) != 1 || is.na(spec$spec_version)) {
+      errors <- c(errors, "mock_spec must have a scalar spec_version.")
+    }
+    if (is.null(spec$variables) || !is.list(spec$variables)) {
+      errors <- c(errors, "mock_spec variables must be a list.")
+    } else {
+      variable_names <- names(spec$variables)
+      if (length(variable_names) != length(unique(variable_names))) {
+        errors <- c(errors, "mock_spec variable names must be unique.")
+      }
+      for (variable in spec$variables) {
+        errors <- c(errors, .validate_mock_spec_variable(variable))
+      }
+    }
+  }
+
+  if (!is.null(n)) {
+    if (!is.numeric(n) || length(n) != 1 || is.na(n) || n < 0 || n != floor(n)) {
+      errors <- c(errors, "n must be a non-negative whole number.")
+    }
+  }
+
+  valid <- length(errors) == 0
+  result <- .new_mock_spec_validation_result(valid, errors, warnings, info)
+
+  if (!valid && isTRUE(strict)) {
+    stop(paste(errors, collapse = "\n"), call. = FALSE)
+  }
+
+  result
+}
diff --git a/development/adr/v04-hybrid-backend.md b/development/adr/v04-hybrid-backend.md
new file mode 100644
index 0000000..eca4420
--- /dev/null
+++ b/development/adr/v04-hybrid-backend.md
@@ -0,0 +1,115 @@
+# ADR: v0.4 Hybrid Backend Architecture
+
+**Status**: draft  
+**Date**: 2026-05-18  
+**Decision owner**: MockData maintainers
+
+## Context
+
+MockData began as an experiment for generating mock testing data from
+recodeflow-style metadata. It now supports categorical, continuous, date,
+garbage-data, and survival-style examples. People are using it, and it is
+becoming part of the recodeflow/cchsflow/chmsflow adoption path.
+
+The v0.3 architecture grew organically. The current generators filter metadata,
+parse ranges, infer generation parameters, generate values, apply missing codes,
+inject garbage, coerce types, and return columns. That made early development
+fast, but it makes cross-variable structure, validation, diagnostics, and
+backend selection harder to reason about.
+
+The v0.4 spike tested whether MockData can normalize user inputs into a
+`mock_spec`, generate through either a native backend or `simstudy`, and keep
+MockData-specific semantics as post-processing. Three review rounds converged on
+the same conclusion: the hybrid architecture is ready for production refactor
+planning.
+
+## Decision
+
+MockData v0.4 will move toward a hybrid backend architecture:
+
+- `mock_spec` is the normalized internal representation.
+- Native MockData generation is the default backend and must work without
+  `simstudy`.
+- `simstudy` is an optional advanced backend for features where it clearly helps,
+  including formula dependencies, correlations, survival durations, and mature
+  simulation mechanics.
+- MockData remains responsible for recodeflow semantics, simple direct APIs,
+  validation, explicit missing-code conventions, garbage/invalid data,
+  diagnostics, date/source-format conversion, and calendar anchoring.
+- `mock_spec` carries `spec_version`, `provenance`, and `model_hint` to preserve
+  adapter agnosticism.
+
+## License And Dependency Posture
+
+MockData remains MIT. `simstudy` is GPL-3, so it will initially be kept optional
+in `Suggests` and accessed through `requireNamespace()`.
+
+Importing `simstudy` as a required dependency would require a conscious future
+governance decision.
+
+## API And Deprecation
+
+Current public functions remain available in v0.4.0:
+
+- `create_mock_data()`
+- `create_cat_var()`
+- `create_con_var()`
+- `create_date_var()`
+- `create_wide_survival_data()`
+
+These functions should become wrappers around the new layered internals where
+possible. They should not be removed in v0.4.0.
+
+Deprecation policy:
+
+- No removal before v0.5.0.
+- Lifecycle deprecation warnings may be added during v0.4.x only after sibling
+  package maintainers have had a migration path.
+- `NEWS.md` must include v0.4.0 migration notes and any deprecation timeline.
+
+## Non-Goals
+
+MockData will not market itself as synthetic data for inference, privacy release,
+or population-valid statistical analysis. It generates mock data for code
+development, QA, documentation, examples, and training.
+
+## Consequences
+
+Positive:
+
+- Recodeflow support remains central.
+- Simple users can use direct APIs without learning `simstudy`.
+- Advanced users can benefit from a mature simulation backend.
+- Missing codes, garbage data, dates, and diagnostics stay MockData-owned.
+- Native generation keeps the package usable where optional dependencies are not
+  installed.
+
+Tradeoffs:
+
+- The package needs a real internal spec model.
+- Backends must be tested for parity where both support the same feature.
+- Some spike contracts need production design: diagnostics, date offsets,
+  formula syntax, custom distribution registry, and correlation merging.
+- Maintaining wrappers will add short-term complexity.
+
+## Implementation Direction
+
+Production refactor should proceed in layers:
+
+1. `mock_spec` constructors and validators.
+2. Direct and recodeflow input adapters.
+3. Formula/dependency evaluator.
+4. Native backend.
+5. Post-processing layer.
+6. Promotion of spike assertions to `testthat`.
+7. Optional `simstudy` backend.
+8. Current API wrappers.
+
+## Open Follow-Up Decisions
+
+- Multi-group correlation merge strategy.
+- Diagnostics object shape.
+- Whether `mock_spec` is internal-only or partially user-facing in v0.4.0.
+- How formula/dependency syntax enters from recodeflow or direct APIs.
+- How Table 1 / summary specifications become a future adapter.
+
diff --git a/development/simstudy-v04.md b/development/simstudy-v04.md
new file mode 100644
index 0000000..5772839
--- /dev/null
+++ b/development/simstudy-v04.md
@@ -0,0 +1,128 @@
+# MockData v0.4 Production Refactor Plan
+
+## 1. Write The ADR First
+
+Write a short architecture decision record before production code changes.
+
+The ADR should lock these decisions:
+
+- **Decision**: MockData adopts a hybrid backend architecture.
+- **Core abstraction**: `mock_spec` is the normalized internal specification.
+- **Forward compatibility**: `mock_spec` carries `spec_version`, `provenance`,
+  and `model_hint` so direct APIs, recodeflow adapters, and future adapters can
+  share one internal representation.
+- **Default backend**: native MockData generation remains the default and must
+  work without `simstudy`.
+- **Optional backend**: `simstudy` is an advanced backend, initially in
+  `Suggests`, gated with `requireNamespace()`.
+- **License posture**: keep MockData MIT by keeping `simstudy` optional unless a
+  future governance decision changes that.
+- **Version target**: v0.4.0.
+- **NEWS commitment**: `NEWS.md` gets a v0.4.0 section with breaking changes,
+  the new spec model, migration notes, and deprecated functions.
+- **Current API timeline**: existing public functions remain as wrappers through
+  v0.4.0. They may be marked lifecycle-deprecated in v0.4.x after sibling
+  packages have migrated, and removed no earlier than v0.5.0.
+- **Non-goal**: MockData remains mock data for code development, QA, and
+  documentation. It is not marketed as synthetic data for inference or privacy
+  release.
+
+## 2. Implement In Layers
+
+Each layer should have focused tests before the next layer starts.
+
+1. **`mock_spec` core**
+   - Constructors and validators.
+   - Stable fields for names, types, ranges, levels, proportions, missing codes,
+     garbage rules, formulas, dates, and backend hints.
+   - Explicit handling for empty specs, `NULL` metadata, single-row specs, and
+     `n = 0`.
+
+2. **Input adapters**
+   - Direct function-argument APIs to `mock_spec`.
+   - Recodeflow `variables` + `variable_details` adapter to `mock_spec`.
+   - Preserve recodeflow semantics as first-class behavior.
+
+3. **Formula/dependency evaluator**
+   - Promote the spike pattern to core: formula referent validation, topological
+     ordering, cycle detection, and sandboxed evaluation in a generated-data
+     environment.
+
+4. **Native backend**
+   - Generate valid baseline values from `mock_spec`.
+   - Keep native support for the simple/core path without `simstudy`.
+   - Add multi-group correlation strategy, including merge behavior with
+     ordinary variables.
+
+5. **Post-processing layer**
+   - Missing codes.
+   - Garbage values.
+   - `rType` coercion.
+   - Date/source-format conversion.
+   - Diagnostics contract.
+   - Replace the legacy `var_row` garbage shim with typed garbage specs.
+
+6. **Spike assertion promotion**
+   - Promote the strongest spike assertions into `testthat`, especially:
+     categorical code/label preservation, missing-code collision diagnostics,
+     seed reproducibility, censor/event date invariants, recEnd-driven
+     missingness, formula dependency validation, and correlation contracts.
+
+7. **Optional `simstudy` backend**
+   - Translate supported `mock_spec` pieces to `simstudy` definitions.
+   - Keep `simstudy` optional with clear errors when unavailable.
+   - Test native/simstudy parity for column names, types, reproducibility, and
+     expected statistical contracts.
+
+8. **Orchestrator wrappers**
+   - Gradually replace current dispatch internals.
+   - Keep `create_mock_data()`, `create_cat_var()`, `create_con_var()`, and
+     `create_date_var()` alive as wrappers during transition.
+
+## 3. Keep The Current API Alive
+
+Existing public functions should remain available in v0.4.0:
+
+- `create_mock_data()`
+- `create_cat_var()`
+- `create_con_var()`
+- `create_date_var()`
+- `create_wide_survival_data()`
+
+These should call the new layered internals where possible. Migration should be
+incremental so cchsflow, chmsflow, and recodeflow users do not need a
+synchronized release.
+
+## 4. Carry-Forward Design Issues
+
+Settle in the ADR or the first design note:
+
+- Multi-group correlation merge strategy.
+- Whether `mock_spec` is internal-only in v0.4.0 or partially user-facing.
+- Diagnostics object shape and stability.
+- `simstudy` dependency posture after governance review.
+- Deprecation schedule for current public wrappers.
+
+Track as implementation issues:
+
+- Empty / `NULL` / single-row input behavior.
+- Date `__offset` convention and whether it remains internal.
+- Garbage `var_row` shim replacement.
+- Distribution registry for custom backend functions.
+- Seed discipline between baseline generation and post-processing.
+- Native vs `simstudy` backend equivalence tests.
+- Event/censoring rate tests with two-sided bounds.
+- Table 1 / summary-spec source as a future adapter.
+
+## 5. Communication
+
+Before v0.4.0 lands, write a short communication note for cchsflow, chmsflow,
+and recodeflow maintainers:
+
+- What changes.
+- What does not change.
+- Which functions remain available.
+- What migration is optional in v0.4.0.
+- When deprecation warnings may begin.
+- How the mock-data framing remains distinct from synthetic-data release.
+
diff --git a/development/v04-simstudy-spike/README.md b/development/v04-simstudy-spike/README.md
deleted file mode 100644
index 42ca5c1..0000000
--- a/development/v04-simstudy-spike/README.md
+++ /dev/null
@@ -1,155 +0,0 @@
-# MockData v0.4 simstudy Spike
-
-This directory contains a disposable architecture spike for MockData v0.4.
-It is intentionally outside the package build and is excluded by
-`.Rbuildignore`.
-
-The working thesis:
-
-> MockData remains the recodeflow-native, MIT-licensed interface for practical
-> mock data. `simstudy` is evaluated as an optional advanced engine for cases
-> where it clearly improves robustness, performance, or modeling capability.
-
-## Run
-
-Install `simstudy` into a temporary library, then run:
-
-```r
-lib <- "/private/tmp/mockdata-simstudy-lib"
-dir.create(lib, recursive = TRUE, showWarnings = FALSE)
-install.packages("simstudy", lib = lib, repos = "https://cloud.r-project.org")
-
-source("development/v04-simstudy-spike/prototype.R")
-```
-
-The prototype expects that temporary library path by default. It does not
-modify `DESCRIPTION`, `renv.lock`, or the package library.
-
-To exercise the native fallback path without `simstudy`, point the temporary
-library variable at an empty path:
-
-```sh
-MOCKDATA_SIMSTUDY_LIB=/private/tmp/no-simstudy-lib \
-  Rscript --vanilla development/v04-simstudy-spike/prototype.R
-```
-
-## What This Prototype Tests
-
-- Recodeflow-style metadata can normalize into a small internal `mock_spec`.
-- The same `mock_spec` can translate to `simstudy::defData()` definitions for
-  age, smoking, interview date offsets, and one formula dependency.
-- `simstudy` can preserve recodeflow categorical codes via categorical levels,
-  and both backends can also generate non-numeric categorical labels such as
-  `"never"`, `"former"`, and `"current"`.
-- Truncated normal age generation can be handled with a `simstudy` custom
-  distribution while MockData still owns range parsing and rType coercion.
-- MockData-style explicit missing codes and garbage values can remain
-  post-processing after baseline valid-value generation.
-- Correlated height/weight generation is straightforward through
-  `simstudy::genCorData()` when correlation parameters are declared in
-  `mock_spec` and translated through the backend definition layer.
-- Survival durations can be generated through `simstudy::defSurv()` /
-  `simstudy::genSurv()` and then anchored back to MockData-owned calendar
-  dates.
-- The same `mock_spec` can drive a native MockData-style generation path when
-  `simstudy` is absent.
-- Missing-code collisions are detectable if post-processing preserves assignment
-  diagnostics. This matters when a valid drawn value can equal an explicit
-  missing code.
-- Seed reproducibility can be asserted across native and `simstudy` paths.
-- Formula dependencies can be validated for missing referents and sorted so
-  formula variables are generated after their inputs.
-- Truncated-normal boundary collapse now fails loudly instead of returning
-  `NaN`.
-
-## Early Read
-
-This first pass supports the hybrid design:
-
-- `simstudy` looks strong as a generation engine.
-- MockData should still own recodeflow semantics, direct simple APIs, validation,
-  missing-code conventions, garbage data, source formats, and calendar anchoring.
-- The normalized `mock_spec` abstraction is useful enough to keep testing.
-
-Open questions remain around dependency/license posture, error wrapping,
-structural constraints, and whether ordinary simple variables should use native
-MockData generation or route through `simstudy`.
-
-## License and Dependency Posture
-
-`simstudy` is GPL-3. MockData is currently MIT. This spike treats `simstudy` as
-an optional advanced backend rather than a required package dependency. A future
-architecture decision needs to explicitly decide whether:
-
-- MockData keeps `simstudy` in `Suggests` with a soft `requireNamespace()` gate.
-- MockData imports `simstudy` and accepts the license/governance implications.
-- MockData keeps a native engine and only borrows design ideas from `simstudy`.
-
-The current prototype supports the first option technically: the native fallback
-path runs without loading `simstudy`.
-
-## Prototype Contracts Surfaced
-
-- `model_hint` is currently a small enum in the prototype rather than an
-  unrestricted string.
-- `provenance` is stored as structured metadata and displayed compactly in the
-  printed spec table.
-- `mockdata_diagnostics` is the prototype mechanism for preserving assignment
-  state after post-processing. This is what lets MockData distinguish a value
-  that was drawn as valid from the same value assigned as an explicit missing
-  code.
-- Survival date columns use mutually exclusive `event_date` and `censor_date`
-  values. Event rows do not also receive a censoring date.
-
-## Review Gaps Addressed After First PR Review
-
-- Added `spec_version`, `provenance`, and `model_hint` fields.
-- Added a fail-loud `from_linkml()` placeholder to keep a future third adapter
-  visible without pretending it is implemented.
-- Added native backend generation from the same `mock_spec`.
-- Added a missing-code collision case where `97` can be both a valid generated
-  value and an assigned missing code.
-- Moved the height/weight correlation example into a `mock_spec` declaration.
-- Added seed reproducibility assertions.
-
-## Review Gaps Addressed After Second PR Review
-
-- Fixed the survival censoring semantic bug where event rows also had
-  `censor_date` populated.
-- Routed correlation parameters through `mock_spec` and the backend definition
-  layer, with both `simstudy` and native Cholesky-based generation paths.
-- Added formula referent validation and dependency ordering.
-- Added statistical-contract assertions for truncated normal moments,
-  categorical proportions, garbage rates, and correlation marginals.
-- Added non-numeric categorical label coverage.
-
-## Remaining Questions Before Production Refactor
-
-- Whether `mockdata_diagnostics` should become a formal internal contract or a
-  different diagnostics object.
-- How much of the prototype `mock_spec` should become user-facing for advanced
-  users.
-- Whether formula/dependency syntax should come from recodeflow metadata,
-  direct MockData arguments, or a future third adapter.
-- Whether `simstudy` remains a `Suggests` backend long term or becomes a
-  stronger package dependency after governance review.
-
-## Internal Contracts Not Yet Generalized
-
-The prototype deliberately encodes a few implementation contracts that are
-useful evidence, but not production-ready API decisions:
-
-- Date variables use hidden `__offset` companion columns during generation, then
-  convert those offsets to calendar dates in post-processing.
-- Custom `simstudy` distributions are resolved by global function name, as with
-  `mockdata_rtrunc_norm`; production code likely needs an explicit distribution
-  registry.
-- Formula variables are added directly to `mock_spec` in the spike rather than
-  parsed from recodeflow metadata.
-- Garbage post-processing still rebuilds a row-shaped `var_row` object to reuse
-  the v0.3 `apply_garbage()` helper.
-- The prototype uses `seed + 1` for post-processing so baseline generation and
-  missing/garbage assignment are reproducible but distinct.
-- Correlated variables currently use a separate backend path from ordinary
-  `defData()` generation; production code needs a merge strategy for multiple
-  correlation groups and ordinary variables.
diff --git a/development/v04-simstudy-spike/prototype.R b/development/v04-simstudy-spike/prototype.R
deleted file mode 100644
index da5bbd3..0000000
--- a/development/v04-simstudy-spike/prototype.R
+++ /dev/null
@@ -1,1071 +0,0 @@
-# MockData v0.4 simstudy architecture spike.
-#
-# This is deliberately a prototype, not package code. It tests whether a small
-# normalized mock_spec can sit between recodeflow metadata and simstudy.
-
-local({
-  spike_lib <- Sys.getenv("MOCKDATA_SIMSTUDY_LIB", "/private/tmp/mockdata-simstudy-lib")
-  if (dir.exists(spike_lib)) {
-    .libPaths(c(spike_lib, .libPaths()))
-  }
-})
-
-simstudy_available <- requireNamespace("simstudy", quietly = TRUE)
-
-source("R/mockdata-parsers.R", local = TRUE)
-source("R/mockdata_helpers.R", local = TRUE)
-
-MODEL_HINTS <- c(
-  "hybrid",
-  "auto",
-  "native-postprocess",
-  "simstudy-or-native",
-  "simstudy-advanced",
-  "diagnostic-required"
-)
-
-`%||%` <- function(x, y) {
-  if (is.null(x)) y else x
-}
-
-mockdata_rtrunc_norm <- function(n, min, max, mu, s) {
-  if (any(!is.finite(min)) || any(!is.finite(max)) || any(min >= max)) {
-    stop("Truncated normal requires finite min < max.", call. = FALSE)
-  }
-
-  f_min <- stats::pnorm(min, mean = mu, sd = s)
-  f_max <- stats::pnorm(max, mean = mu, sd = s)
-  if (any(!is.finite(f_min)) || any(!is.finite(f_max)) || any(f_min >= f_max)) {
-    stop("Truncated normal bounds collapse to an empty probability interval.", call. = FALSE)
-  }
-
-  stats::qnorm(stats::runif(n, min = f_min, max = f_max), mean = mu, sd = s)
-}
-
-new_mock_spec <- function(vars,
-                          spec_version = "0.4-spike-1",
-                          provenance = list(adapter = "mixed", source = "prototype"),
-                          model_hint = "hybrid",
-                          correlation_groups = list()) {
-  validate_model_hint(model_hint)
-
-  structure(
-    vars,
-    class = "mock_spec",
-    spec_version = spec_version,
-    provenance = provenance,
-    model_hint = model_hint,
-    correlation_groups = correlation_groups
-  )
-}
-
-add_spec_metadata <- function(var,
-                              spec_version = "0.4-spike-1",
-                              provenance = "direct",
-                              model_hint = "auto") {
-  validate_model_hint(model_hint)
-
-  var$spec_version <- spec_version
-  var$provenance <- normalize_provenance(provenance)
-  var$model_hint <- model_hint
-  var
-}
-
-normalize_provenance <- function(provenance) {
-  if (is.list(provenance)) {
-    return(provenance)
-  }
-
-  list(adapter = provenance, source = provenance)
-}
-
-format_provenance <- function(provenance) {
-  provenance <- normalize_provenance(provenance)
-  values <- unique(unname(unlist(provenance, use.names = FALSE)))
-  paste(values, collapse = "/")
-}
-
-validate_model_hint <- function(model_hint) {
-  if (!model_hint %in% MODEL_HINTS) {
-    stop(
-      "Unknown model_hint: ", model_hint,
-      ". Expected one of: ", paste(MODEL_HINTS, collapse = ", "),
-      call. = FALSE
-    )
-  }
-
-  invisible(TRUE)
-}
-
-mock_spec_continuous <- function(name,
-                                 range,
-                                 distribution = "uniform",
-                                 mean = NA_real_,
-                                 sd = NA_real_,
-                                 rtype = "double",
-                                 missing_codes = numeric(0),
-                                 missing_proportions = numeric(0),
-                                 garbage = list(),
-                                 source = "direct",
-                                 provenance = source,
-                                 model_hint = "auto",
-                                 correlation_group = NA_character_) {
-  add_spec_metadata(list(
-    name = name,
-    type = "continuous",
-    rtype = rtype,
-    distribution = distribution,
-    range = range,
-    mean = mean,
-    sd = sd,
-    levels = NULL,
-    proportions = NULL,
-    formula = NULL,
-    missing_codes = missing_codes,
-    missing_proportions = missing_proportions,
-    garbage = garbage,
-    source = source,
-    correlation_group = correlation_group
-  ), provenance = provenance, model_hint = model_hint)
-}
-
-mock_spec_categorical <- function(name,
-                                  levels,
-                                  proportions = NULL,
-                                  rtype = "factor",
-                                  missing_codes = character(0),
-                                  missing_proportions = numeric(0),
-                                  garbage = list(),
-                                  source = "direct",
-                                  provenance = source,
-                                  model_hint = "auto") {
-  if (is.null(proportions)) {
-    proportions <- rep(1 / length(levels), length(levels))
-  }
-
-  add_spec_metadata(list(
-    name = name,
-    type = "categorical",
-    rtype = rtype,
-    distribution = "categorical",
-    range = NULL,
-    mean = NA_real_,
-    sd = NA_real_,
-    levels = levels,
-    proportions = proportions,
-    formula = NULL,
-    missing_codes = missing_codes,
-    missing_proportions = missing_proportions,
-    garbage = garbage,
-    source = source
-  ), provenance = provenance, model_hint = model_hint)
-}
-
-mock_spec_date <- function(name,
-                           range,
-                           rtype = "date",
-                           source_format = "analysis",
-                           source = "direct",
-                           provenance = source,
-                           model_hint = "native-postprocess") {
-  add_spec_metadata(list(
-    name = name,
-    type = "date",
-    rtype = rtype,
-    distribution = "uniform",
-    range = range,
-    mean = NA_real_,
-    sd = NA_real_,
-    levels = NULL,
-    proportions = NULL,
-    formula = NULL,
-    missing_codes = character(0),
-    missing_proportions = numeric(0),
-    garbage = list(),
-    source_format = source_format,
-    source = source
-  ), provenance = provenance, model_hint = model_hint)
-}
-
-mock_spec_binary_formula <- function(name, formula, rtype = "integer") {
-  add_spec_metadata(list(
-    name = name,
-    type = "binary_formula",
-    rtype = rtype,
-    distribution = "binary",
-    range = c(0, 1),
-    mean = NA_real_,
-    sd = NA_real_,
-    levels = NULL,
-    proportions = NULL,
-    formula = formula,
-    missing_codes = character(0),
-    missing_proportions = numeric(0),
-    garbage = list(),
-    source = "formula"
-  ), provenance = "formula", model_hint = "simstudy-or-native")
-}
-
-mock_spec_correlated_continuous <- function(name,
-                                            mean,
-                                            sd,
-                                            correlation_group,
-                                            range = c(-Inf, Inf),
-                                            rtype = "double") {
-  mock_spec_continuous(
-    name = name,
-    range = range,
-    distribution = "correlated_normal",
-    mean = mean,
-    sd = sd,
-    rtype = rtype,
-    source = "correlation_spec",
-    provenance = "direct",
-    model_hint = "simstudy-advanced",
-    correlation_group = correlation_group
-  )
-}
-
-spec_table <- function(spec) {
-  data.frame(
-    name = vapply(spec, `[[`, character(1), "name"),
-    type = vapply(spec, `[[`, character(1), "type"),
-    rtype = vapply(spec, `[[`, character(1), "rtype"),
-    distribution = vapply(spec, `[[`, character(1), "distribution"),
-    source = vapply(spec, `[[`, character(1), "source"),
-    provenance = vapply(spec, function(x) format_provenance(x$provenance), character(1)),
-    model_hint = vapply(spec, `[[`, character(1), "model_hint"),
-    stringsAsFactors = FALSE
-  )
-}
-
-first_range <- function(details_subset) {
-  for (value in details_subset$recStart) {
-    parsed <- parse_range_notation(value)
-    if (!is.null(parsed) && parsed$type %in% c("integer", "continuous", "date")) {
-      return(c(parsed$min, parsed$max))
-    }
-  }
-
-  NULL
-}
-
-extract_missing <- function(details_subset) {
-  missing_rows <- details_subset[
-    grepl("^NA::", details_subset$recEnd %||% "", ignore.case = TRUE),
-  ]
-
-  if (nrow(missing_rows) == 0) {
-    return(list(codes = character(0), proportions = numeric(0)))
-  }
-
-  props <- missing_rows$proportion
-  props[is.na(props)] <- 0
-  list(
-    codes = stats::setNames(missing_rows$recStart, missing_rows$recStart),
-    proportions = stats::setNames(props, missing_rows$recStart)
-  )
-}
-
-extract_garbage <- function(var_row) {
-  fields <- c(
-    "garbage_low_prop", "garbage_low_range",
-    "garbage_high_prop", "garbage_high_range"
-  )
-  fields <- fields[fields %in% names(var_row)]
-  stats::setNames(as.list(var_row[1, fields, drop = TRUE]), fields)
-}
-
-as_mock_spec_from_recodeflow <- function(variables, variable_details, databaseStart) {
-  out <- list()
-
-  for (i in seq_len(nrow(variables))) {
-    var_row <- variables[i, ]
-    name <- var_row$variable
-    details_subset <- variable_details[
-      variable_details$variable == name &
-        .database_start_matches(variable_details$databaseStart, databaseStart, allow_empty = TRUE),
-    ]
-
-    type <- tolower(var_row$variableType)
-    rtype <- tolower(var_row$rType)
-    missing <- extract_missing(details_subset)
-    garbage <- extract_garbage(var_row)
-
-    if (type %in% c("continuous", "integer", "numeric")) {
-      out[[name]] <- mock_spec_continuous(
-        name = name,
-        range = first_range(details_subset),
-        distribution = if ("distribution" %in% names(var_row)) var_row$distribution else "uniform",
-        mean = if ("mean" %in% names(var_row)) var_row$mean else NA_real_,
-        sd = if ("sd" %in% names(var_row)) var_row$sd else NA_real_,
-        rtype = rtype,
-        missing_codes = missing$codes,
-        missing_proportions = missing$proportions,
-        garbage = garbage,
-        source = "recodeflow"
-      )
-    } else if (type %in% c("categorical", "factor")) {
-      props <- extract_proportions(details_subset, name)
-      missing_codes <- stats::setNames(names(props$missing), names(props$missing))
-      out[[name]] <- mock_spec_categorical(
-        name = name,
-        levels = props$categories,
-        proportions = props$category_proportions,
-        rtype = rtype,
-        missing_codes = missing_codes,
-        missing_proportions = unlist(props$missing, use.names = TRUE),
-        garbage = garbage,
-        source = "recodeflow"
-      )
-    } else if (type == "date") {
-      out[[name]] <- mock_spec_date(
-        name = name,
-        range = first_range(details_subset),
-        rtype = rtype,
-        source = "recodeflow"
-      )
-    } else {
-      stop("Unsupported prototype variableType: ", var_row$variableType, call. = FALSE)
-    }
-  }
-
-  new_mock_spec(out)
-}
-
-simstudy_formula <- function(x) {
-  paste(x, collapse = ";")
-}
-
-formula_dependencies <- function(var) {
-  if (is.null(var$formula) || is.na(var$formula)) {
-    return(character(0))
-  }
-
-  all.vars(str2lang(var$formula))
-}
-
-order_spec_by_dependencies <- function(spec) {
-  remaining <- names(spec)
-  ordered <- character(0)
-
-  while (length(remaining) > 0) {
-    progressed <- FALSE
-
-    for (name in remaining) {
-      deps <- intersect(formula_dependencies(spec[[name]]), names(spec))
-      if (all(deps %in% ordered)) {
-        ordered <- c(ordered, name)
-        remaining <- setdiff(remaining, name)
-        progressed <- TRUE
-      }
-    }
-
-    if (!progressed) {
-      stop(
-        "Formula dependency cycle or unresolved ordering among: ",
-        paste(remaining, collapse = ", "),
-        call. = FALSE
-      )
-    }
-  }
-
-  new_mock_spec(
-    spec[ordered],
-    spec_version = attr(spec, "spec_version"),
-    provenance = attr(spec, "provenance"),
-    model_hint = attr(spec, "model_hint"),
-    correlation_groups = attr(spec, "correlation_groups") %||% list()
-  )
-}
-
-validate_formula_referents <- function(spec) {
-  spec_names <- names(spec)
-
-  for (var in spec) {
-    missing <- setdiff(formula_dependencies(var), spec_names)
-    if (length(missing) > 0) {
-      stop(
-        "Formula for variable '", var$name, "' references unknown variable(s): ",
-        paste(missing, collapse = ", "),
-        call. = FALSE
-      )
-    }
-  }
-
-  invisible(TRUE)
-}
-
-correlation_defs_from_spec <- function(spec) {
-  groups <- attr(spec, "correlation_groups") %||% list()
-  group_names <- unique(na.omit(vapply(
-    spec,
-    function(var) var$correlation_group %||% NA_character_,
-    character(1)
-  )))
-
-  lapply(stats::setNames(group_names, group_names), function(group_name) {
-    vars <- spec[vapply(
-      spec,
-      function(var) identical(var$correlation_group %||% NA_character_, group_name),
-      logical(1)
-    )]
-    config <- groups[[group_name]] %||% list(rho = 0, corstr = "cs")
-
-    list(
-      group = group_name,
-      names = vapply(vars, `[[`, character(1), "name"),
-      means = vapply(vars, `[[`, numeric(1), "mean"),
-      sds = vapply(vars, `[[`, numeric(1), "sd"),
-      rho = config$rho %||% 0,
-      corstr = config$corstr %||% "cs"
-    )
-  })
-}
-
-as_simstudy_def <- function(spec) {
-  validate_formula_referents(spec)
-  spec <- order_spec_by_dependencies(spec)
-  def <- NULL
-
-  for (var in spec) {
-    if (identical(var$distribution, "correlated_normal")) {
-      next
-    }
-
-    if (!simstudy_available) {
-      stop("simstudy is not available; use backend = 'native' for this spike.", call. = FALSE)
-    }
-
-    if (var$type == "continuous") {
-      range <- var$range
-      if (var$distribution == "normal") {
-        def <- simstudy::defData(
-          def,
-          varname = var$name,
-          formula = "mockdata_rtrunc_norm",
-          variance = paste0(
-            "min = ", range[[1]],
-            ", max = ", range[[2]],
-            ", mu = ", var$mean,
-            ", s = ", var$sd
-          ),
-          dist = "custom"
-        )
-      } else if (var$rtype == "integer") {
-        def <- simstudy::defData(
-          def,
-          varname = var$name,
-          formula = simstudy_formula(range),
-          dist = "uniformInt"
-        )
-      } else {
-        def <- simstudy::defData(
-          def,
-          varname = var$name,
-          formula = simstudy_formula(range),
-          dist = "uniform"
-        )
-      }
-    } else if (var$type == "categorical") {
-      def <- simstudy::defData(
-        def,
-        varname = var$name,
-        formula = simstudy_formula(var$proportions),
-        variance = simstudy_formula(var$levels),
-        dist = "categorical"
-      )
-    } else if (var$type == "date") {
-      days <- as.integer(var$range[[2]] - var$range[[1]])
-      def <- simstudy::defData(
-        def,
-        varname = paste0(var$name, "__offset"),
-        formula = paste0("0;", days),
-        dist = "uniformInt"
-      )
-    } else if (var$type == "binary_formula") {
-      def <- simstudy::defData(
-        def,
-        varname = var$name,
-        formula = var$formula,
-        dist = "binary",
-        link = "logit"
-      )
-    }
-  }
-
-  structure(
-    list(data_def = def, correlation_groups = correlation_defs_from_spec(spec)),
-    class = "mock_simstudy_def"
-  )
-}
-
-generate_mock_data_simstudy <- function(spec, n, seed = NULL) {
-  if (!simstudy_available) {
-    stop("simstudy is not available; use generate_mock_data_native().", call. = FALSE)
-  }
-
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
-  sim_def <- as_simstudy_def(spec)
-  simstudy::genData(n, sim_def$data_def)
-}
-
-generate_mock_data_native <- function(spec, n, seed = NULL) {
-  validate_formula_referents(spec)
-  spec <- order_spec_by_dependencies(spec)
-
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
-  data <- data.frame(id = seq_len(n))
-
-  for (var in spec) {
-    if (var$type == "continuous") {
-      if (identical(var$distribution, "normal")) {
-        data[[var$name]] <- mockdata_rtrunc_norm(
-          n,
-          min = var$range[[1]],
-          max = var$range[[2]],
-          mu = var$mean,
-          s = var$sd
-        )
-      } else if (var$rtype == "integer") {
-        data[[var$name]] <- sample(seq(var$range[[1]], var$range[[2]]), n, replace = TRUE)
-      } else {
-        data[[var$name]] <- stats::runif(n, var$range[[1]], var$range[[2]])
-      }
-    } else if (var$type == "categorical") {
-      data[[var$name]] <- sample(
-        var$levels,
-        n,
-        replace = TRUE,
-        prob = var$proportions
-      )
-    } else if (var$type == "date") {
-      days <- as.integer(var$range[[2]] - var$range[[1]])
-      data[[paste0(var$name, "__offset")]] <- sample(0:days, n, replace = TRUE)
-    } else if (var$type == "binary_formula") {
-      linear_predictor <- eval(str2expression(var$formula), envir = data, enclos = parent.frame())
-      data[[var$name]] <- stats::rbinom(n, size = 1, prob = stats::plogis(linear_predictor))
-    }
-  }
-
-  data
-}
-
-generate_correlated_simstudy <- function(sim_def, n, seed = NULL) {
-  if (!simstudy_available) {
-    stop("simstudy is not available; use generate_correlated_native().", call. = FALSE)
-  }
-
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
-  group <- sim_def$correlation_groups[[1]]
-  as.data.frame(simstudy::genCorData(
-    n,
-    mu = group$means,
-    sigma = group$sds,
-    rho = group$rho,
-    corstr = group$corstr,
-    cnames = group$names
-  ))
-}
-
-generate_correlated_native <- function(sim_def, n, seed = NULL) {
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
-  group <- sim_def$correlation_groups[[1]]
-  n_vars <- length(group$names)
-  cor_matrix <- matrix(group$rho, nrow = n_vars, ncol = n_vars)
-  diag(cor_matrix) <- 1
-  z <- matrix(stats::rnorm(n * n_vars), nrow = n)
-  values <- z %*% chol(cor_matrix)
-  values <- sweep(values, 2, group$sds, `*`)
-  values <- sweep(values, 2, group$means, `+`)
-  out <- as.data.frame(values)
-  names(out) <- group$names
-  out$id <- seq_len(n)
-  out[c("id", group$names)]
-}
-
-inject_missing_codes <- function(values,
-                                 missing_codes,
-                                 missing_proportions,
-                                 seed = NULL,
-                                 return_assignment = FALSE) {
-  assignment <- rep("valid", length(values))
-
-  if (length(missing_codes) == 0 || sum(missing_proportions, na.rm = TRUE) <= 0) {
-    if (return_assignment) {
-      return(list(values = values, assignment = assignment))
-    }
-    return(values)
-  }
-
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
-  missing_proportions[is.na(missing_proportions)] <- 0
-  valid_prop <- max(0, 1 - sum(missing_proportions))
-  assignment <- sample(
-    c("valid", names(missing_proportions)),
-    length(values),
-    replace = TRUE,
-    prob = c(valid_prop, missing_proportions)
-  )
-
-  values <- apply_missing_codes(values, assignment, as.list(missing_codes))
-
-  if (return_assignment) {
-    return(list(values = values, assignment = assignment))
-  }
-
-  values
-}
-
-coerce_mock_rtype <- function(values, rtype) {
-  switch(
-    rtype,
-    integer = as.integer(round(as.numeric(values))),
-    numeric = as.numeric(values),
-    double = as.double(values),
-    factor = factor(values),
-    character = as.character(values),
-    date = as.Date(values),
-    values
-  )
-}
-
-postprocess_mock_data <- function(data, spec, seed = NULL) {
-  data <- as.data.frame(data)
-  diagnostics <- list(missing_assignments = list())
-
-  for (var in spec) {
-    if (var$type == "date") {
-      offset_name <- paste0(var$name, "__offset")
-      data[[var$name]] <- var$range[[1]] + data[[offset_name]]
-      data[[offset_name]] <- NULL
-    }
-
-    if (!var$name %in% names(data)) {
-      next
-    }
-
-    missing_result <- inject_missing_codes(
-      data[[var$name]],
-      missing_codes = var$missing_codes,
-      missing_proportions = var$missing_proportions,
-      seed = seed,
-      return_assignment = TRUE
-    )
-    data[[var$name]] <- missing_result$values
-    diagnostics$missing_assignments[[var$name]] <- missing_result$assignment
-
-    if (length(var$garbage) > 0) {
-      var_row <- as.data.frame(var$garbage, stringsAsFactors = FALSE)
-      data[[var$name]] <- apply_garbage(
-        data[[var$name]],
-        var_row = var_row,
-        variable_type = var$rtype,
-        missing_codes = unname(unlist(var$missing_codes, use.names = FALSE)),
-        seed = seed
-      )
-    }
-
-    data[[var$name]] <- coerce_mock_rtype(data[[var$name]], var$rtype)
-  }
-
-  attr(data, "mockdata_diagnostics") <- diagnostics
-  data
-}
-
-example_recodeflow_metadata <- function() {
-  variables <- data.frame(
-    variable = c("age", "smoking", "interview_date"),
-    variableType = c("continuous", "categorical", "date"),
-    rType = c("integer", "integer", "date"),
-    distribution = c("normal", NA, "uniform"),
-    mean = c(50, NA, NA),
-    sd = c(15, NA, NA),
-    garbage_low_prop = c(0.02, NA, NA),
-    garbage_low_range = c("[0,17]", NA, NA),
-    garbage_high_prop = c(0.02, NA, NA),
-    garbage_high_range = c("[101,115]", NA, NA),
-    stringsAsFactors = FALSE
-  )
-
-  variable_details <- data.frame(
-    variable = c(
-      "age", "age", "age",
-      "smoking", "smoking", "smoking", "smoking",
-      "interview_date"
-    ),
-    databaseStart = "minimal-example",
-    recStart = c(
-      "[18,100]", "997", "998",
-      "1", "2", "3", "7",
-      "[2001-01-01;2005-12-31]"
-    ),
-    recEnd = c(
-      "valid", "NA::b", "NA::b",
-      "valid", "valid", "valid", "NA::b",
-      "valid"
-    ),
-    proportion = c(
-      NA, 0.02, 0.01,
-      0.50, 0.30, 0.17, 0.03,
-      NA
-    ),
-    catLabel = c(
-      NA, "don't know", "refused",
-      "never", "former", "current", "don't know",
-      NA
-    ),
-    stringsAsFactors = FALSE
-  )
-
-  list(variables = variables, variable_details = variable_details)
-}
-
-run_correlated_height_weight <- function(n = 1000,
-                                         seed = 123,
-                                         backend = c("simstudy", "native")) {
-  backend <- match.arg(backend)
-  correlation_spec <- new_mock_spec(list(
-    height_cm = mock_spec_correlated_continuous(
-      "height_cm",
-      mean = 170,
-      sd = 10,
-      correlation_group = "body_size"
-    ),
-    weight_kg = mock_spec_correlated_continuous(
-      "weight_kg",
-      mean = 78,
-      sd = 16,
-      correlation_group = "body_size"
-    )
-  ),
-  provenance = list(adapter = "direct", source = "correlation prototype"),
-  correlation_groups = list(body_size = list(rho = 0.65, corstr = "cs")))
-
-  sim_def <- as_simstudy_def(correlation_spec)
-  out <- if (backend == "simstudy") {
-    generate_correlated_simstudy(sim_def, n = n, seed = seed)
-  } else {
-    generate_correlated_native(sim_def, n = n, seed = seed)
-  }
-
-  list(spec = correlation_spec, simstudy_def = sim_def, data = out)
-}
-
-run_survival_anchor <- function(n = 1000, seed = 123) {
-  if (!simstudy_available) {
-    stop("simstudy is not available; survival generation is an advanced backend test.", call. = FALSE)
-  }
-
-  set.seed(seed)
-  base_def <- simstudy::defData(varname = "exposed", formula = 0.40, dist = "binary")
-  surv_def <- simstudy::defSurv(
-    varname = "event_time",
-    formula = "0.3 * exposed",
-    scale = 600,
-    shape = 1
-  )
-  surv_def <- simstudy::defSurv(surv_def, varname = "censor_time", scale = 1500, shape = 1)
-
-  data <- simstudy::genData(n, base_def)
-  data <- simstudy::genSurv(
-    data,
-    surv_def,
-    timeName = "followup_days",
-    censorName = "censor_time",
-    eventName = "event"
-  )
-
-  data <- as.data.frame(data)
-  entry_start <- as.Date("2001-01-01")
-  data$entry_date <- entry_start + sample(0:365, n, replace = TRUE)
-  data$event_date <- as.Date(NA)
-  data$censor_date <- as.Date(NA)
-  event_idx <- which(data$event == 1)
-  censor_idx <- which(data$event == 0)
-  data$event_date[event_idx] <- data$entry_date[event_idx] + round(data$followup_days[event_idx])
-  data$censor_date[censor_idx] <- data$entry_date[censor_idx] + round(data$followup_days[censor_idx])
-  data
-}
-
-run_spike <- function(n = 1000, seed = 123, backend = c("simstudy", "native")) {
-  backend <- match.arg(backend)
-
-  metadata <- example_recodeflow_metadata()
-  spec <- as_mock_spec_from_recodeflow(
-    metadata$variables,
-    metadata$variable_details,
-    databaseStart = "minimal-example"
-  )
-  spec[["high_visits"]] <- mock_spec_binary_formula(
-    "high_visits",
-    "-4 + 0.04 * age + 0.8 * (smoking == 3)"
-  )
-
-  baseline <- if (backend == "simstudy") {
-    generate_mock_data_simstudy(spec, n = n, seed = seed)
-  } else {
-    generate_mock_data_native(spec, n = n, seed = seed)
-  }
-  final <- postprocess_mock_data(baseline, spec, seed = seed + 1)
-  correlated <- if (backend == "simstudy") run_correlated_height_weight(n, seed = seed) else NULL
-  survival <- if (backend == "simstudy") run_survival_anchor(n, seed = seed) else NULL
-
-  list(
-    backend = backend,
-    spec = spec,
-    spec_table = spec_table(spec),
-    simstudy_def = if (backend == "simstudy") as_simstudy_def(spec) else NULL,
-    baseline = as.data.frame(baseline),
-    final = final,
-    correlated = correlated,
-    survival = survival
-  )
-}
-
-assert_spike <- function(result) {
-  spec <- result$spec
-  baseline <- result$baseline
-  final <- result$final
-  diagnostics <- attr(final, "mockdata_diagnostics")
-
-  stopifnot(identical(attr(spec, "spec_version"), "0.4-spike-1"))
-  stopifnot(identical(attr(spec, "model_hint"), "hybrid"))
-  stopifnot(isTRUE(all.equal(as.numeric(spec$age$range), c(18, 100))))
-  stopifnot(identical(spec$smoking$levels, c("1", "2", "3")))
-  if (result$backend == "simstudy") {
-    stopifnot(inherits(result$simstudy_def, "mock_simstudy_def"))
-    stopifnot("data.table" %in% class(result$simstudy_def$data_def))
-  }
-  stopifnot(all(c("age", "smoking", "interview_date", "high_visits") %in% names(final)))
-  stopifnot(all(baseline$age >= 18 & baseline$age <= 100))
-  stopifnot(abs(mean(baseline$age) - 50) < 2)
-  stopifnot(abs(stats::sd(baseline$age) - 15) < 3)
-  stopifnot(is.integer(final$age))
-  stopifnot(all(baseline$smoking %in% c(1, 2, 3)))
-  smoking_props <- prop.table(table(baseline$smoking))
-  stopifnot(abs(unname(smoking_props["1"]) - 0.50) < 0.07)
-  stopifnot(abs(unname(smoking_props["2"]) - 0.30) < 0.07)
-  stopifnot(abs(unname(smoking_props["3"]) - 0.20) < 0.07)
-  stopifnot(all(final$high_visits %in% c(0, 1)))
-  stopifnot(any(final$high_visits == 1))
-  stopifnot(inherits(final$interview_date, "Date"))
-  stopifnot(all(final$interview_date >= as.Date("2001-01-01")))
-  stopifnot(all(final$interview_date <= as.Date("2005-12-31")))
-  stopifnot(any(final$age %in% c(997L, 998L)))
-  stopifnot(any(final$age < 18L | final$age > 100L))
-  age_valid_assignment <- diagnostics$missing_assignments$age == "valid"
-  age_garbage_rate <- mean(
-    (final$age < 18L | final$age > 100L) & age_valid_assignment,
-    na.rm = TRUE
-  )
-  stopifnot(age_garbage_rate > 0.02)
-  stopifnot(age_garbage_rate < 0.06)
-  stopifnot(any(final$smoking == 7L))
-  stopifnot(any(diagnostics$missing_assignments$smoking == "7"))
-  stopifnot(abs(mean(diagnostics$missing_assignments$smoking == "7") - 0.03) < 0.03)
-
-  if (result$backend == "simstudy") {
-    correlated <- result$correlated$data
-    correlation_spec <- result$correlated$spec
-    survival <- result$survival
-
-    stopifnot(all(vapply(correlation_spec, `[[`, character(1), "correlation_group") == "body_size"))
-    stopifnot(inherits(result$correlated$simstudy_def, "mock_simstudy_def"))
-    stopifnot(abs(stats::cor(correlated$height_cm, correlated$weight_kg) - 0.65) < 0.08)
-    stopifnot(abs(mean(correlated$height_cm) - 170) < 2)
-    stopifnot(abs(mean(correlated$weight_kg) - 78) < 3)
-    stopifnot(abs(stats::sd(correlated$height_cm) - 10) < 2)
-    stopifnot(abs(stats::sd(correlated$weight_kg) - 16) < 3)
-    stopifnot(all(survival$followup_days >= 0))
-    stopifnot(any(survival$event == 1))
-    stopifnot(all(is.na(survival$event_date) | survival$event_date >= survival$entry_date))
-    stopifnot(all(is.na(survival$censor_date) | survival$censor_date >= survival$entry_date))
-    stopifnot(!any(!is.na(survival$event_date) & !is.na(survival$censor_date)))
-  }
-
-  invisible(TRUE)
-}
-
-assert_native_fallback <- function(n = 1000, seed = 123) {
-  native_result <- run_spike(n = n, seed = seed, backend = "native")
-  final <- native_result$final
-
-  stopifnot(is.null(native_result$simstudy_def))
-  stopifnot(all(c("age", "smoking", "interview_date", "high_visits") %in% names(final)))
-  stopifnot(is.integer(final$age))
-  stopifnot(inherits(final$interview_date, "Date"))
-  stopifnot(any(final$age %in% c(997L, 998L)))
-  stopifnot(any(final$smoking == 7L))
-
-  invisible(native_result)
-}
-
-assert_missing_collision_case <- function(seed = 123) {
-  spec <- new_mock_spec(list(
-    collision_code = mock_spec_categorical(
-      name = "collision_code",
-      levels = c("1", "2", "97", "99"),
-      proportions = c(0.20, 0.20, 0.50, 0.10),
-      rtype = "integer",
-      missing_codes = c("97" = "97"),
-      missing_proportions = c("97" = 0.10),
-      provenance = "collision-test",
-      model_hint = "diagnostic-required"
-    )
-  ))
-
-  baseline <- generate_mock_data_native(spec, n = 1000, seed = seed)
-  final <- postprocess_mock_data(baseline, spec, seed = seed + 1)
-  assignment <- attr(final, "mockdata_diagnostics")$missing_assignments$collision_code
-
-  stopifnot(any(baseline$collision_code == "97"))
-  stopifnot(any(final$collision_code == 97L & assignment == "valid"))
-  stopifnot(any(final$collision_code == 97L & assignment == "97"))
-
-  invisible(final)
-}
-
-assert_non_numeric_categorical_labels <- function(seed = 123) {
-  spec <- new_mock_spec(list(
-    smoking_label = mock_spec_categorical(
-      name = "smoking_label",
-      levels = c("never", "former", "current"),
-      proportions = c(0.50, 0.30, 0.20),
-      rtype = "character",
-      provenance = "label-test",
-      model_hint = "simstudy-or-native"
-    )
-  ))
-
-  native <- postprocess_mock_data(
-    generate_mock_data_native(spec, n = 1000, seed = seed),
-    spec,
-    seed = seed + 1
-  )
-  stopifnot(all(native$smoking_label %in% c("never", "former", "current")))
-
-  if (simstudy_available) {
-    sim <- postprocess_mock_data(
-      generate_mock_data_simstudy(spec, n = 1000, seed = seed),
-      spec,
-      seed = seed + 1
-    )
-    stopifnot(all(sim$smoking_label %in% c("never", "former", "current")))
-  }
-
-  invisible(TRUE)
-}
-
-assert_formula_dependency_validation <- function() {
-  spec <- new_mock_spec(list(
-    outcome = mock_spec_binary_formula("outcome", "-1 + missing_predictor")
-  ))
-
-  error <- tryCatch(
-    {
-      validate_formula_referents(spec)
-      NULL
-    },
-    error = conditionMessage
-  )
-  stopifnot(grepl("missing_predictor", error))
-
-  unordered <- new_mock_spec(list(
-    outcome = mock_spec_binary_formula("outcome", "-4 + 0.04 * age"),
-    age = mock_spec_continuous(
-      "age",
-      range = c(18, 100),
-      distribution = "normal",
-      mean = 50,
-      sd = 15,
-      rtype = "integer"
-    )
-  ))
-  ordered <- order_spec_by_dependencies(unordered)
-  stopifnot(identical(names(ordered), c("age", "outcome")))
-
-  invisible(TRUE)
-}
-
-assert_truncated_normal_boundaries <- function() {
-  error <- tryCatch(
-    {
-      mockdata_rtrunc_norm(10, min = 5, max = 5, mu = 5, s = 1)
-      NULL
-    },
-    error = conditionMessage
-  )
-  stopifnot(grepl("min < max", error))
-
-  invisible(TRUE)
-}
-
-assert_seed_reproducibility <- function() {
-  first <- run_spike(seed = 123, backend = "native")$final
-  second <- run_spike(seed = 123, backend = "native")$final
-  stopifnot(identical(first, second))
-
-  if (simstudy_available) {
-    first_simstudy <- run_spike(seed = 123, backend = "simstudy")$final
-    second_simstudy <- run_spike(seed = 123, backend = "simstudy")$final
-    stopifnot(identical(first_simstudy, second_simstudy))
-  }
-
-  invisible(TRUE)
-}
-
-from_linkml <- function(...) {
-  stop(
-    "from_linkml() is a forward-compatibility placeholder for a future ",
-    "third input adapter; it is not implemented in this spike.",
-    call. = FALSE
-  )
-}
-
-spike_result <- if (simstudy_available) {
-  run_spike(backend = "simstudy")
-} else {
-  message("simstudy is not available; running native fallback assertions only.")
-  run_spike(backend = "native")
-}
-assert_spike(spike_result)
-native_result <- assert_native_fallback()
-collision_result <- assert_missing_collision_case()
-assert_non_numeric_categorical_labels()
-assert_formula_dependency_validation()
-assert_truncated_normal_boundaries()
-assert_seed_reproducibility()
-
-cat("MockData v0.4 simstudy spike passed.\n\n")
-print(spike_result$spec_table)
-cat("\nGenerated data preview:\n")
-print(utils::head(spike_result$final))
-if (simstudy_available) {
-  cat("\nCorrelated height/weight correlation:\n")
-  print(stats::cor(
-    spike_result$correlated$data$height_cm,
-    spike_result$correlated$data$weight_kg
-  ))
-  cat("\nSurvival preview:\n")
-  print(utils::head(spike_result$survival[c("id", "exposed", "followup_days", "event", "entry_date", "event_date", "censor_date")]))
-}
-cat("\nNative fallback preview:\n")
-print(utils::head(native_result$final))
-cat("\nMissing-code collision preview:\n")
-print(utils::head(collision_result))
diff --git a/tests/testthat/test-mock-spec.R b/tests/testthat/test-mock-spec.R
new file mode 100644
index 0000000..bc8de92
--- /dev/null
+++ b/tests/testthat/test-mock-spec.R
@@ -0,0 +1,143 @@
+test_that("mock_spec creates an empty specification", {
+  spec <- mock_spec()
+
+  expect_s3_class(spec, "mock_spec")
+  expect_true(is_mock_spec(spec))
+  expect_equal(spec$spec_version, "0.4.0")
+  expect_equal(length(spec$variables), 0)
+  expect_true(validate_mock_spec(spec)$valid)
+})
+
+test_that("mock_spec accepts NULL as an empty specification", {
+  spec <- mock_spec(NULL)
+
+  expect_s3_class(spec, "mock_spec")
+  expect_equal(length(spec$variables), 0)
+  expect_true(validate_mock_spec(spec, n = 0)$valid)
+})
+
+test_that("mock_spec supports single continuous variable specs", {
+  age <- mock_spec_continuous(
+    name = "age",
+    range = c(18, 85),
+    distribution = "normal",
+    mean = 50,
+    sd = 12,
+    rtype = "integer",
+    missing_codes = c(997, 998),
+    missing_proportions = c(0.02, 0.01)
+  )
+  spec <- mock_spec(age)
+
+  expect_s3_class(age, "mock_spec_variable")
+  expect_named(spec$variables, "age")
+  expect_equal(spec$variables$age$type, "continuous")
+  expect_equal(spec$variables$age$range, c(18, 85))
+  expect_equal(spec$variables$age$provenance$adapter, "direct")
+  expect_true(validate_mock_spec(spec, n = 1)$valid)
+})
+
+test_that("mock_spec supports categorical variable specs", {
+  smoking <- mock_spec_categorical(
+    name = "smoking",
+    levels = c("never", "former", "current"),
+    proportions = c(0.5, 0.3, 0.2),
+    rtype = "character"
+  )
+  spec <- mock_spec(list(smoking))
+
+  expect_named(spec$variables, "smoking")
+  expect_equal(spec$variables$smoking$type, "categorical")
+  expect_equal(spec$variables$smoking$levels, c("never", "former", "current"))
+  expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.2))
+  expect_true(validate_mock_spec(spec)$valid)
+})
+
+test_that("mock_spec supports date variable specs", {
+  interview_date <- mock_spec_date(
+    name = "interview_date",
+    range = as.Date(c("2001-01-01", "2005-12-31")),
+    source_format = "analysis"
+  )
+  spec <- mock_spec(interview_date)
+
+  expect_named(spec$variables, "interview_date")
+  expect_equal(spec$variables$interview_date$type, "date")
+  expect_s3_class(spec$variables$interview_date$range, "Date")
+  expect_equal(spec$variables$interview_date$model_hint, "native-postprocess")
+  expect_true(validate_mock_spec(spec, n = 0)$valid)
+})
+
+test_that("mock_spec validates n as a non-negative whole number", {
+  spec <- mock_spec()
+
+  expect_true(validate_mock_spec(spec, n = 0)$valid)
+  expect_error(validate_mock_spec(spec, n = -1), "non-negative whole number")
+  expect_error(validate_mock_spec(spec, n = 1.5), "non-negative whole number")
+  expect_error(validate_mock_spec(spec, n = NA_real_), "non-negative whole number")
+})
+
+test_that("validate_mock_spec returns structured errors when strict is FALSE", {
+  spec <- mock_spec(mock_spec_categorical(
+    name = "smoking",
+    levels = c("never", "former", "current"),
+    proportions = c(0.5, 0.3)
+  ))
+
+  result <- validate_mock_spec(spec, strict = FALSE)
+
+  expect_false(result$valid)
+  expect_true(any(grepl("one proportion per level", result$errors)))
+})
+
+test_that("validate_mock_spec catches malformed continuous and date ranges", {
+  bad_continuous <- mock_spec(mock_spec_continuous(
+    name = "age",
+    range = c(85, 18)
+  ))
+  expect_error(validate_mock_spec(bad_continuous), "lower bound")
+
+  bad_date <- mock_spec(mock_spec_date(
+    name = "interview_date",
+    range = c("2001-01-01", "2005-12-31")
+  ))
+  expect_error(validate_mock_spec(bad_date), "range must be Date")
+})
+
+test_that("validate_mock_spec catches normal distribution parameter errors", {
+  bad_normal <- mock_spec(mock_spec_continuous(
+    name = "age",
+    range = c(18, 85),
+    distribution = "normal",
+    mean = 50,
+    sd = 0
+  ))
+
+  expect_error(validate_mock_spec(bad_normal), "sd > 0")
+})
+
+test_that("mock_spec rejects duplicate variable names", {
+  spec <- mock_spec(
+    mock_spec_continuous("age", range = c(18, 85)),
+    mock_spec_continuous("age", range = c(0, 100))
+  )
+
+  result <- validate_mock_spec(spec, strict = FALSE)
+
+  expect_false(result$valid)
+  expect_true(any(grepl("unique", result$errors)))
+})
+
+test_that("mock_spec validates model hints", {
+  expect_error(
+    mock_spec_continuous("age", range = c(18, 85), model_hint = "magic"),
+    "model_hint"
+  )
+})
+
+test_that("validate_mock_spec rejects non-spec objects", {
+  result <- validate_mock_spec(list(), strict = FALSE)
+
+  expect_false(result$valid)
+  expect_true(any(grepl("mock_spec", result$errors)))
+})

From 75484a3f6a001b59559241d4adf9e8fd48f897cc Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 07:04:04 -0400
Subject: [PATCH 02/18] Address mock_spec milestone review

---
 NAMESPACE                       |   1 +
 NEWS.md                         |  12 ++++
 R/mock_spec.R                   | 118 +++++++++++++++++++++++++++++++-
 tests/testthat/test-mock-spec.R |  66 ++++++++++++++++--
 4 files changed, 190 insertions(+), 7 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 33c9e9a..0c73a0a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(print,mock_spec_validation_result)
 S3method(print,mockdata_validation_result)
 export(add_garbage)
 export(apply_garbage)
diff --git a/NEWS.md b/NEWS.md
index 3c5a00a..b67598b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+# MockData 0.4.0
+
+## Development
+
+- Started the v0.4 production refactor around a normalized `mock_spec`
+  architecture.
+- Added `mock_spec()`, `mock_spec_continuous()`, `mock_spec_categorical()`,
+  `mock_spec_date()`, `is_mock_spec()`, and `validate_mock_spec()`.
+- Added forward-compatible specification fields: `spec_version`, `provenance`,
+  and `model_hint`.
+- Existing v0.3 generator APIs remain available while v0.4 internals are built.
+
 # MockData 0.3.0
 
 ## Breaking changes
diff --git a/R/mock_spec.R b/R/mock_spec.R
index 69b8a0a..96ed932 100644
--- a/R/mock_spec.R
+++ b/R/mock_spec.R
@@ -17,6 +17,29 @@
   "diagnostic-required"
 )
 
+#' Model hints for MockData specifications
+#'
+#' Model hints are lightweight backend guidance carried by `mock_spec` objects
+#' and variables. They are not generation commands; generation backends may use
+#' them to choose a sensible default path.
+#'
+#' Supported values:
+#' \describe{
+#'   \item{`auto`}{Let MockData choose the backend.}
+#'   \item{`native`}{Prefer the native MockData backend.}
+#'   \item{`simstudy`}{Prefer the optional `simstudy` backend.}
+#'   \item{`native-postprocess`}{Generate baseline values natively, then rely
+#'   on MockData post-processing such as date/source-format conversion.}
+#'   \item{`simstudy-or-native`}{Either backend is expected to be suitable.}
+#'   \item{`simstudy-advanced`}{Feature is expected to need advanced `simstudy`
+#'   support, such as correlations or survival durations.}
+#'   \item{`diagnostic-required`}{Generation/post-processing must preserve
+#'   diagnostics needed to interpret the result.}
+#' }
+#'
+#' @name mock_spec_model_hints
+NULL
+
 `%||%` <- function(x, y) {
   if (is.null(x)) y else x
 }
@@ -135,16 +158,31 @@
 #' @param provenance List or character describing where the spec came from.
 #' @param model_hint Character backend hint. One of the supported MockData model
 #'   hints.
+#' @param validate Logical. If `TRUE`, validate the constructed specification
+#'   before returning it.
 #'
 #' @return S3 object of class `mock_spec`.
+#'
+#' @examples
+#' spec <- mock_spec(
+#'   mock_spec_continuous("age", range = c(18, 85), rtype = "integer"),
+#'   mock_spec_categorical(
+#'     "smoking",
+#'     levels = c("never", "former", "current"),
+#'     proportions = c(0.5, 0.3, 0.2)
+#'   )
+#' )
+#' validate_mock_spec(spec)
+#'
 #' @export
 mock_spec <- function(...,
                       spec_version = .mock_spec_version,
                       provenance = list(adapter = "direct", source = "direct"),
-                      model_hint = "auto") {
+                      model_hint = "auto",
+                      validate = TRUE) {
   .validate_model_hint(model_hint)
 
-  structure(
+  spec <- structure(
     list(
       spec_version = spec_version,
       provenance = .normalize_provenance(provenance),
@@ -153,6 +191,12 @@ mock_spec <- function(...,
     ),
     class = c("mock_spec", "list")
   )
+
+  if (isTRUE(validate)) {
+    validate_mock_spec(spec, strict = TRUE)
+  }
+
+  spec
 }
 
 #' Create a continuous variable specification
@@ -170,6 +214,17 @@ mock_spec <- function(...,
 #' @param model_hint Backend hint.
 #'
 #' @return A `mock_spec_variable` object.
+#'
+#' @examples
+#' age <- mock_spec_continuous(
+#'   "age",
+#'   range = c(18, 85),
+#'   distribution = "normal",
+#'   mean = 50,
+#'   sd = 12,
+#'   rtype = "integer"
+#' )
+#'
 #' @export
 mock_spec_continuous <- function(name,
                                  range,
@@ -212,6 +267,15 @@ mock_spec_continuous <- function(name,
 #' @param model_hint Backend hint.
 #'
 #' @return A `mock_spec_variable` object.
+#'
+#' @examples
+#' smoking <- mock_spec_categorical(
+#'   "smoking",
+#'   levels = c("never", "former", "current"),
+#'   proportions = c(0.5, 0.3, 0.2),
+#'   rtype = "character"
+#' )
+#'
 #' @export
 mock_spec_categorical <- function(name,
                                   levels,
@@ -251,6 +315,13 @@ mock_spec_categorical <- function(name,
 #' @param model_hint Backend hint.
 #'
 #' @return A `mock_spec_variable` object.
+#'
+#' @examples
+#' interview_date <- mock_spec_date(
+#'   "interview_date",
+#'   range = as.Date(c("2001-01-01", "2005-12-31"))
+#' )
+#'
 #' @export
 mock_spec_date <- function(name,
                            range,
@@ -281,6 +352,12 @@ mock_spec_date <- function(name,
 #' @param x Object to check.
 #'
 #' @return Logical scalar.
+#'
+#' @examples
+#' spec <- mock_spec()
+#' is_mock_spec(spec)
+#' is_mock_spec(list())
+#'
 #' @export
 is_mock_spec <- function(x) {
   inherits(x, "mock_spec")
@@ -301,6 +378,35 @@ is_mock_spec <- function(x) {
   )
 }
 
+#' @export
+print.mock_spec_validation_result <- function(x, ...) {
+  status <- if (isTRUE(x$valid)) "valid" else "invalid"
+  cat("MockData mock_spec validation result: ", status, "\n", sep = "")
+
+  if (length(x$errors) > 0) {
+    cat("\nErrors:\n")
+    for (i in seq_along(x$errors)) {
+      cat(i, ". ", x$errors[[i]], "\n", sep = "")
+    }
+  }
+
+  if (length(x$warnings) > 0) {
+    cat("\nWarnings:\n")
+    for (i in seq_along(x$warnings)) {
+      cat(i, ". ", x$warnings[[i]], "\n", sep = "")
+    }
+  }
+
+  if (length(x$info) > 0) {
+    cat("\nInfo:\n")
+    for (i in seq_along(x$info)) {
+      cat(i, ". ", x$info[[i]], "\n", sep = "")
+    }
+  }
+
+  invisible(x)
+}
+
 .validate_probability_vector <- function(values, label, allow_null = FALSE) {
   errors <- character(0)
 
@@ -435,6 +541,14 @@ is_mock_spec <- function(x) {
 #'   a validation result object is returned.
 #'
 #' @return A `mock_spec_validation_result` object when valid or `strict = FALSE`.
+#'
+#' @examples
+#' spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85)))
+#' validate_mock_spec(spec)
+#'
+#' result <- validate_mock_spec(list(), strict = FALSE)
+#' result$valid
+#'
 #' @export
 validate_mock_spec <- function(spec, n = NULL, strict = TRUE) {
   errors <- character(0)
diff --git a/tests/testthat/test-mock-spec.R b/tests/testthat/test-mock-spec.R
index bc8de92..edc3f41 100644
--- a/tests/testthat/test-mock-spec.R
+++ b/tests/testthat/test-mock-spec.R
@@ -82,7 +82,7 @@ test_that("validate_mock_spec returns structured errors when strict is FALSE", {
     name = "smoking",
     levels = c("never", "former", "current"),
     proportions = c(0.5, 0.3)
-  ))
+  ), validate = FALSE)
 
   result <- validate_mock_spec(spec, strict = FALSE)
 
@@ -94,13 +94,13 @@ test_that("validate_mock_spec catches malformed continuous and date ranges", {
   bad_continuous <- mock_spec(mock_spec_continuous(
     name = "age",
     range = c(85, 18)
-  ))
+  ), validate = FALSE)
   expect_error(validate_mock_spec(bad_continuous), "lower bound")
 
   bad_date <- mock_spec(mock_spec_date(
     name = "interview_date",
     range = c("2001-01-01", "2005-12-31")
-  ))
+  ), validate = FALSE)
   expect_error(validate_mock_spec(bad_date), "range must be Date")
 })
 
@@ -111,7 +111,7 @@ test_that("validate_mock_spec catches normal distribution parameter errors", {
     distribution = "normal",
     mean = 50,
     sd = 0
-  ))
+  ), validate = FALSE)
 
   expect_error(validate_mock_spec(bad_normal), "sd > 0")
 })
@@ -119,7 +119,8 @@ test_that("validate_mock_spec catches normal distribution parameter errors", {
 test_that("mock_spec rejects duplicate variable names", {
   spec <- mock_spec(
     mock_spec_continuous("age", range = c(18, 85)),
-    mock_spec_continuous("age", range = c(0, 100))
+    mock_spec_continuous("age", range = c(0, 100)),
+    validate = FALSE
   )
 
   result <- validate_mock_spec(spec, strict = FALSE)
@@ -141,3 +142,58 @@ test_that("validate_mock_spec rejects non-spec objects", {
   expect_false(result$valid)
   expect_true(any(grepl("mock_spec", result$errors)))
 })
+
+test_that("mock_spec validates on construction by default", {
+  expect_error(
+    mock_spec(mock_spec_categorical(
+      name = "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3)
+    )),
+    "one proportion per level"
+  )
+})
+
+test_that("validate_mock_spec accumulates multiple errors", {
+  spec <- mock_spec(
+    mock_spec_categorical(
+      name = "smoking",
+      levels = character(0),
+      proportions = c(0.5, 0.5)
+    ),
+    validate = FALSE
+  )
+
+  result <- validate_mock_spec(spec, strict = FALSE)
+
+  expect_false(result$valid)
+  expect_true(length(result$errors) >= 2)
+  expect_true(any(grepl("at least one level", result$errors)))
+  expect_true(any(grepl("one proportion per level", result$errors)))
+})
+
+test_that("is_mock_spec returns FALSE for non-spec objects", {
+  expect_false(is_mock_spec(list()))
+  expect_false(is_mock_spec(NULL))
+  expect_false(is_mock_spec(data.frame()))
+})
+
+test_that("mock_spec preserves spec_version and rejects missing spec_version", {
+  spec <- mock_spec(spec_version = "0.4.0-test")
+
+  expect_equal(spec$spec_version, "0.4.0-test")
+
+  spec$spec_version <- NA_character_
+  result <- validate_mock_spec(spec, strict = FALSE)
+
+  expect_false(result$valid)
+  expect_true(any(grepl("spec_version", result$errors)))
+})
+
+test_that("print.mock_spec_validation_result summarizes errors", {
+  result <- validate_mock_spec(list(), strict = FALSE)
+
+  expect_output(print(result), "invalid")
+  expect_output(print(result), "Errors")
+  expect_output(print(result), "spec must be a mock_spec object")
+})

From 33d7aee57323aa4d416b3bff28c1de50fe3fa95e Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 07:11:08 -0400
Subject: [PATCH 03/18] Add direct mock specification helpers

---
 NAMESPACE                             |   3 +
 NEWS.md                               |   3 +
 R/mock_spec.R                         | 191 ++++++++++++++++++++++++++
 tests/testthat/test-direct-mock-api.R |  81 +++++++++++
 4 files changed, 278 insertions(+)
 create mode 100644 tests/testthat/test-direct-mock-api.R

diff --git a/NAMESPACE b/NAMESPACE
index 0c73a0a..cb0f2e3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -26,6 +26,9 @@ export(identify_derived_vars)
 export(import_from_recodeflow)
 export(is_mock_spec)
 export(make_garbage)
+export(mock_categorical)
+export(mock_continuous)
+export(mock_date)
 export(mock_spec)
 export(mock_spec_categorical)
 export(mock_spec_continuous)
diff --git a/NEWS.md b/NEWS.md
index b67598b..80afcd5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,6 +6,9 @@
   architecture.
 - Added `mock_spec()`, `mock_spec_continuous()`, `mock_spec_categorical()`,
   `mock_spec_date()`, `is_mock_spec()`, and `validate_mock_spec()`.
+- Added direct specification helpers `mock_continuous()`,
+  `mock_categorical()`, and `mock_date()` for simple use without
+  recodeflow-style metadata tables.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec.R b/R/mock_spec.R
index 96ed932..928a5a2 100644
--- a/R/mock_spec.R
+++ b/R/mock_spec.R
@@ -146,6 +146,13 @@ NULL
   variables
 }
 
+.direct_api_provenance <- function(source, provenance = NULL) {
+  .normalize_provenance(
+    provenance %||% list(adapter = "direct", source = source),
+    source = source
+  )
+}
+
 #' Create a MockData specification
 #'
 #' `mock_spec()` creates the normalized v0.4 specification object used by the
@@ -199,6 +206,190 @@ mock_spec <- function(...,
   spec
 }
 
+#' Create a direct continuous mock-data specification
+#'
+#' `mock_continuous()` is the simple direct API for continuous variables. It
+#' returns a validated `mock_spec`; it does not generate data. Generation
+#' backends will consume this specification in a later v0.4 milestone.
+#'
+#' @param name Variable name.
+#' @param range Numeric vector of length two giving the inclusive valid range.
+#' @param distribution Distribution name. Defaults to `"uniform"`.
+#' @param mean,sd Optional distribution parameters. Required when
+#'   `distribution = "normal"`.
+#' @param rtype R output type. Defaults to `"double"`.
+#' @param missing_codes Explicit missing-code values.
+#' @param missing_proportions Missing-code probabilities aligned to
+#'   `missing_codes`.
+#' @param garbage_rules List of intentional invalid-value rules.
+#' @param provenance Optional provenance metadata. Defaults to the direct API.
+#' @param model_hint Backend hint.
+#' @param spec_version Character version of the specification shape.
+#'
+#' @return A validated `mock_spec` object containing one continuous variable.
+#'
+#' @examples
+#' age_spec <- mock_continuous(
+#'   "age",
+#'   range = c(18, 85),
+#'   distribution = "normal",
+#'   mean = 50,
+#'   sd = 12,
+#'   rtype = "integer"
+#' )
+#' validate_mock_spec(age_spec)
+#'
+#' @export
+mock_continuous <- function(name,
+                            range,
+                            distribution = "uniform",
+                            mean = NA_real_,
+                            sd = NA_real_,
+                            rtype = "double",
+                            missing_codes = numeric(0),
+                            missing_proportions = numeric(0),
+                            garbage_rules = list(),
+                            provenance = NULL,
+                            model_hint = "auto",
+                            spec_version = .mock_spec_version) {
+  provenance <- .direct_api_provenance("mock_continuous", provenance)
+
+  mock_spec(
+    mock_spec_continuous(
+      name = name,
+      range = range,
+      distribution = distribution,
+      mean = mean,
+      sd = sd,
+      rtype = rtype,
+      missing_codes = missing_codes,
+      missing_proportions = missing_proportions,
+      garbage_rules = garbage_rules,
+      provenance = provenance,
+      model_hint = model_hint
+    ),
+    spec_version = spec_version,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
+
+#' Create a direct categorical mock-data specification
+#'
+#' `mock_categorical()` is the simple direct API for categorical variables. It
+#' returns a validated `mock_spec`; it does not generate data.
+#'
+#' @param name Variable name.
+#' @param levels Character vector of valid levels or codes.
+#' @param proportions Optional probabilities aligned to `levels`.
+#' @param rtype R output type. Defaults to `"factor"`.
+#' @param missing_codes Explicit missing-code values.
+#' @param missing_proportions Missing-code probabilities aligned to
+#'   `missing_codes`.
+#' @param garbage_rules List of intentional invalid-value rules.
+#' @param provenance Optional provenance metadata. Defaults to the direct API.
+#' @param model_hint Backend hint.
+#' @param spec_version Character version of the specification shape.
+#'
+#' @return A validated `mock_spec` object containing one categorical variable.
+#'
+#' @examples
+#' smoking_spec <- mock_categorical(
+#'   "smoking",
+#'   levels = c("never", "former", "current"),
+#'   proportions = c(0.5, 0.3, 0.2),
+#'   rtype = "character"
+#' )
+#' validate_mock_spec(smoking_spec)
+#'
+#' @export
+mock_categorical <- function(name,
+                             levels,
+                             proportions = NULL,
+                             rtype = "factor",
+                             missing_codes = character(0),
+                             missing_proportions = numeric(0),
+                             garbage_rules = list(),
+                             provenance = NULL,
+                             model_hint = "auto",
+                             spec_version = .mock_spec_version) {
+  provenance <- .direct_api_provenance("mock_categorical", provenance)
+
+  mock_spec(
+    mock_spec_categorical(
+      name = name,
+      levels = levels,
+      proportions = proportions,
+      rtype = rtype,
+      missing_codes = missing_codes,
+      missing_proportions = missing_proportions,
+      garbage_rules = garbage_rules,
+      provenance = provenance,
+      model_hint = model_hint
+    ),
+    spec_version = spec_version,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
+
+#' Create a direct date mock-data specification
+#'
+#' `mock_date()` is the simple direct API for date variables. It returns a
+#' validated `mock_spec`; it does not generate data.
+#'
+#' @param name Variable name.
+#' @param range Date vector of length two giving the inclusive valid date range.
+#' @param rtype R output type. Defaults to `"date"`.
+#' @param source_format Source-format hint. Defaults to `"analysis"`.
+#' @param missing_codes Explicit missing-code values.
+#' @param missing_proportions Missing-code probabilities aligned to
+#'   `missing_codes`.
+#' @param garbage_rules List of intentional invalid-value rules.
+#' @param provenance Optional provenance metadata. Defaults to the direct API.
+#' @param model_hint Backend hint.
+#' @param spec_version Character version of the specification shape.
+#'
+#' @return A validated `mock_spec` object containing one date variable.
+#'
+#' @examples
+#' interview_date_spec <- mock_date(
+#'   "interview_date",
+#'   range = as.Date(c("2001-01-01", "2005-12-31"))
+#' )
+#' validate_mock_spec(interview_date_spec)
+#'
+#' @export
+mock_date <- function(name,
+                      range,
+                      rtype = "date",
+                      source_format = "analysis",
+                      missing_codes = character(0),
+                      missing_proportions = numeric(0),
+                      garbage_rules = list(),
+                      provenance = NULL,
+                      model_hint = "native-postprocess",
+                      spec_version = .mock_spec_version) {
+  provenance <- .direct_api_provenance("mock_date", provenance)
+
+  mock_spec(
+    mock_spec_date(
+      name = name,
+      range = range,
+      rtype = rtype,
+      source_format = source_format,
+      missing_codes = missing_codes,
+      missing_proportions = missing_proportions,
+      garbage_rules = garbage_rules,
+      provenance = provenance,
+      model_hint = model_hint
+    ),
+    spec_version = spec_version,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
+
 #' Create a continuous variable specification
 #'
 #' @param name Variable name.
diff --git a/tests/testthat/test-direct-mock-api.R b/tests/testthat/test-direct-mock-api.R
new file mode 100644
index 0000000..2248282
--- /dev/null
+++ b/tests/testthat/test-direct-mock-api.R
@@ -0,0 +1,81 @@
+test_that("mock_continuous creates a validated one-variable spec", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    distribution = "normal",
+    mean = 50,
+    sd = 12,
+    rtype = "integer",
+    missing_codes = c(997, 998),
+    missing_proportions = c(0.02, 0.01)
+  )
+
+  expect_s3_class(spec, "mock_spec")
+  expect_named(spec$variables, "age")
+  expect_equal(spec$variables$age$type, "continuous")
+  expect_equal(spec$variables$age$rtype, "integer")
+  expect_equal(spec$provenance$adapter, "direct")
+  expect_equal(spec$provenance$source, "mock_continuous")
+  expect_true(validate_mock_spec(spec)$valid)
+})
+
+test_that("mock_categorical creates a validated one-variable spec", {
+  spec <- mock_categorical(
+    "smoking",
+    levels = c("never", "former", "current"),
+    proportions = c(0.5, 0.3, 0.2),
+    rtype = "character"
+  )
+
+  expect_s3_class(spec, "mock_spec")
+  expect_named(spec$variables, "smoking")
+  expect_equal(spec$variables$smoking$type, "categorical")
+  expect_equal(spec$variables$smoking$levels, c("never", "former", "current"))
+  expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.2))
+  expect_equal(spec$provenance$source, "mock_categorical")
+  expect_true(validate_mock_spec(spec)$valid)
+})
+
+test_that("mock_date creates a validated one-variable spec", {
+  spec <- mock_date(
+    "interview_date",
+    range = as.Date(c("2001-01-01", "2005-12-31"))
+  )
+
+  expect_s3_class(spec, "mock_spec")
+  expect_named(spec$variables, "interview_date")
+  expect_equal(spec$variables$interview_date$type, "date")
+  expect_s3_class(spec$variables$interview_date$range, "Date")
+  expect_equal(spec$model_hint, "native-postprocess")
+  expect_equal(spec$provenance$source, "mock_date")
+  expect_true(validate_mock_spec(spec)$valid)
+})
+
+test_that("direct mock APIs validate immediately", {
+  expect_error(
+    mock_continuous(
+      "age",
+      range = c(18, 85),
+      distribution = "normal",
+      mean = 50
+    ),
+    "sd > 0"
+  )
+
+  expect_error(
+    mock_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3)
+    ),
+    "one proportion per level"
+  )
+
+  expect_error(
+    mock_date(
+      "interview_date",
+      range = c("2001-01-01", "2005-12-31")
+    ),
+    "range must be Date"
+  )
+})

From f29414defb3fd815b47e92dd6ac7b40cfbe45648 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 09:24:35 -0400
Subject: [PATCH 04/18] Tighten mock_spec auditability contracts

---
 R/mock_spec.R                         | 99 +++++++++++++++++++++++----
 development/adr/v04-hybrid-backend.md |  7 +-
 tests/testthat/test-direct-mock-api.R | 73 ++++++++++++++++++++
 tests/testthat/test-mock-spec.R       | 53 ++++++++++++++
 4 files changed, 219 insertions(+), 13 deletions(-)

diff --git a/R/mock_spec.R b/R/mock_spec.R
index 928a5a2..7fd3e51 100644
--- a/R/mock_spec.R
+++ b/R/mock_spec.R
@@ -7,6 +7,8 @@
 
 .mock_spec_version <- "0.4.0"
 
+.mock_spec_probability_tolerance <- 1e-8
+
 .mock_spec_model_hints <- c(
   "auto",
   "native",
@@ -44,6 +46,10 @@ NULL
   if (is.null(x)) y else x
 }
 
+.is_non_empty_string <- function(x) {
+  is.character(x) && length(x) == 1 && !is.na(x) && nzchar(trimws(x))
+}
+
 .normalize_provenance <- function(provenance, source = NULL) {
   if (is.null(provenance)) {
     provenance <- list(adapter = "direct", source = source %||% "direct")
@@ -51,14 +57,22 @@ NULL
     provenance <- list(adapter = as.character(provenance), source = source %||% as.character(provenance))
   }
 
-  if (is.null(provenance$adapter) || is.na(provenance$adapter) || provenance$adapter == "") {
-    provenance$adapter <- "unknown"
+  if (!.is_non_empty_string(provenance$adapter)) {
+    stop("provenance$adapter must be a non-empty string.", call. = FALSE)
   }
-  if (is.null(provenance$source) || is.na(provenance$source) || provenance$source == "") {
+  if (is.null(provenance$source) && !is.null(source)) {
+    provenance$source <- source
+  } else if (is.null(provenance$source)) {
     provenance$source <- provenance$adapter
   }
+  if (!.is_non_empty_string(provenance$source)) {
+    stop("provenance$source must be a non-empty string.", call. = FALSE)
+  }
 
-  provenance
+  c(
+    list(adapter = provenance$adapter, source = provenance$source),
+    provenance[setdiff(names(provenance), c("adapter", "source"))]
+  )
 }
 
 .validate_model_hint <- function(model_hint) {
@@ -147,10 +161,15 @@ NULL
 }
 
 .direct_api_provenance <- function(source, provenance = NULL) {
-  .normalize_provenance(
-    provenance %||% list(adapter = "direct", source = source),
-    source = source
-  )
+  provenance <- provenance %||% list(source = source)
+
+  if (!is.list(provenance)) {
+    provenance <- list(source = as.character(provenance))
+  }
+  provenance$adapter <- "direct"
+  provenance$source <- provenance$source %||% source
+
+  .normalize_provenance(provenance, source = source)
 }
 
 #' Create a MockData specification
@@ -169,6 +188,8 @@ NULL
 #'   before returning it.
 #'
 #' @return S3 object of class `mock_spec`.
+#' @family mock specification APIs
+#' @seealso [mock_continuous()], [mock_categorical()], [mock_date()]
 #'
 #' @examples
 #' spec <- mock_spec(
@@ -227,6 +248,8 @@ mock_spec <- function(...,
 #' @param spec_version Character version of the specification shape.
 #'
 #' @return A validated `mock_spec` object containing one continuous variable.
+#' @family direct specification APIs
+#' @seealso [mock_spec()], [mock_spec_continuous()]
 #'
 #' @examples
 #' age_spec <- mock_continuous(
@@ -292,6 +315,8 @@ mock_continuous <- function(name,
 #' @param spec_version Character version of the specification shape.
 #'
 #' @return A validated `mock_spec` object containing one categorical variable.
+#' @family direct specification APIs
+#' @seealso [mock_spec()], [mock_spec_categorical()]
 #'
 #' @examples
 #' smoking_spec <- mock_categorical(
@@ -351,6 +376,8 @@ mock_categorical <- function(name,
 #' @param spec_version Character version of the specification shape.
 #'
 #' @return A validated `mock_spec` object containing one date variable.
+#' @family direct specification APIs
+#' @seealso [mock_spec()], [mock_spec_date()]
 #'
 #' @examples
 #' interview_date_spec <- mock_date(
@@ -405,6 +432,8 @@ mock_date <- function(name,
 #' @param model_hint Backend hint.
 #'
 #' @return A `mock_spec_variable` object.
+#' @family mock specification APIs
+#' @seealso [mock_spec()], [mock_continuous()]
 #'
 #' @examples
 #' age <- mock_spec_continuous(
@@ -458,6 +487,8 @@ mock_spec_continuous <- function(name,
 #' @param model_hint Backend hint.
 #'
 #' @return A `mock_spec_variable` object.
+#' @family mock specification APIs
+#' @seealso [mock_spec()], [mock_categorical()]
 #'
 #' @examples
 #' smoking <- mock_spec_categorical(
@@ -506,6 +537,8 @@ mock_spec_categorical <- function(name,
 #' @param model_hint Backend hint.
 #'
 #' @return A `mock_spec_variable` object.
+#' @family mock specification APIs
+#' @seealso [mock_spec()], [mock_date()]
 #'
 #' @examples
 #' interview_date <- mock_spec_date(
@@ -622,6 +655,22 @@ print.mock_spec_validation_result <- function(x, ...) {
   errors
 }
 
+.validate_provenance <- function(provenance, label) {
+  errors <- character(0)
+
+  if (!is.list(provenance)) {
+    return(paste0(label, " provenance must be a list."))
+  }
+  if (!.is_non_empty_string(provenance$adapter)) {
+    errors <- c(errors, paste0(label, " provenance$adapter must be a non-empty string."))
+  }
+  if (!.is_non_empty_string(provenance$source)) {
+    errors <- c(errors, paste0(label, " provenance$source must be a non-empty string."))
+  }
+
+  errors
+}
+
 .validate_missing_spec <- function(variable) {
   errors <- character(0)
 
@@ -643,7 +692,7 @@ print.mock_spec_validation_result <- function(x, ...) {
   ))
 
   missing_sum <- sum(variable$missing_proportions, na.rm = TRUE)
-  if (missing_sum > 1) {
+  if (missing_sum > 1 + .mock_spec_probability_tolerance) {
     errors <- c(errors, paste0(
       "Variable '", variable$name,
       "' missing proportions must sum to <= 1."
@@ -685,6 +734,21 @@ print.mock_spec_validation_result <- function(x, ...) {
   }
 
   errors <- c(errors, .validate_missing_spec(variable))
+  errors <- c(errors, .validate_provenance(
+    variable$provenance,
+    paste0("Variable '", variable$name, "'")
+  ))
+  if (is.null(variable$model_hint) ||
+      length(variable$model_hint) != 1 ||
+      is.na(variable$model_hint) ||
+      !variable$model_hint %in% .mock_spec_model_hints) {
+    errors <- c(errors, paste0(
+      "Variable '", variable$name,
+      "' model_hint must be one of: ",
+      paste(.mock_spec_model_hints, collapse = ", "),
+      "."
+    ))
+  }
 
   if (variable$type == "continuous") {
     errors <- c(errors, .validate_range(variable$range, variable$name, "numeric"))
@@ -710,7 +774,7 @@ print.mock_spec_validation_result <- function(x, ...) {
         allow_null = FALSE
       ))
       prop_sum <- sum(variable$proportions, na.rm = TRUE)
-      if (abs(prop_sum - 1) > 0.001) {
+      if (abs(prop_sum - 1) > .mock_spec_probability_tolerance) {
         errors <- c(errors, paste0("Variable '", variable$name, "' proportions must sum to 1."))
       }
     }
@@ -749,8 +813,19 @@ validate_mock_spec <- function(spec, n = NULL, strict = TRUE) {
   if (!is_mock_spec(spec)) {
     errors <- c(errors, "spec must be a mock_spec object.")
   } else {
-    if (is.null(spec$spec_version) || length(spec$spec_version) != 1 || is.na(spec$spec_version)) {
-      errors <- c(errors, "mock_spec must have a scalar spec_version.")
+    if (!.is_non_empty_string(spec$spec_version)) {
+      errors <- c(errors, "mock_spec must have a non-empty scalar spec_version.")
+    }
+    errors <- c(errors, .validate_provenance(spec$provenance, "mock_spec"))
+    if (is.null(spec$model_hint) ||
+        length(spec$model_hint) != 1 ||
+        is.na(spec$model_hint) ||
+        !spec$model_hint %in% .mock_spec_model_hints) {
+      errors <- c(errors, paste0(
+        "mock_spec model_hint must be one of: ",
+        paste(.mock_spec_model_hints, collapse = ", "),
+        "."
+      ))
     }
     if (is.null(spec$variables) || !is.list(spec$variables)) {
       errors <- c(errors, "mock_spec variables must be a list.")
diff --git a/development/adr/v04-hybrid-backend.md b/development/adr/v04-hybrid-backend.md
index eca4420..63fc032 100644
--- a/development/adr/v04-hybrid-backend.md
+++ b/development/adr/v04-hybrid-backend.md
@@ -112,4 +112,9 @@ Production refactor should proceed in layers:
 - Whether `mock_spec` is internal-only or partially user-facing in v0.4.0.
 - How formula/dependency syntax enters from recodeflow or direct APIs.
 - How Table 1 / summary specifications become a future adapter.
-
+- How the legacy `var_row` shim used by v0.3 garbage helpers is replaced with
+  typed v0.4 post-processing specs.
+- Empty, `NULL`, `n = 0`, and single-row input behavior across adapters and
+  backends.
+- Seed discipline across native generation, post-processing, and the optional
+  `simstudy` backend.
diff --git a/tests/testthat/test-direct-mock-api.R b/tests/testthat/test-direct-mock-api.R
index 2248282..d66c087 100644
--- a/tests/testthat/test-direct-mock-api.R
+++ b/tests/testthat/test-direct-mock-api.R
@@ -16,6 +16,8 @@ test_that("mock_continuous creates a validated one-variable spec", {
   expect_equal(spec$variables$age$rtype, "integer")
   expect_equal(spec$provenance$adapter, "direct")
   expect_equal(spec$provenance$source, "mock_continuous")
+  expect_equal(spec$variables$age$provenance$adapter, "direct")
+  expect_equal(spec$variables$age$provenance$source, "mock_continuous")
   expect_true(validate_mock_spec(spec)$valid)
 })
 
@@ -32,7 +34,10 @@ test_that("mock_categorical creates a validated one-variable spec", {
   expect_equal(spec$variables$smoking$type, "categorical")
   expect_equal(spec$variables$smoking$levels, c("never", "former", "current"))
   expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.2))
+  expect_equal(spec$provenance$adapter, "direct")
   expect_equal(spec$provenance$source, "mock_categorical")
+  expect_equal(spec$variables$smoking$provenance$adapter, "direct")
+  expect_equal(spec$variables$smoking$provenance$source, "mock_categorical")
   expect_true(validate_mock_spec(spec)$valid)
 })
 
@@ -47,10 +52,78 @@ test_that("mock_date creates a validated one-variable spec", {
   expect_equal(spec$variables$interview_date$type, "date")
   expect_s3_class(spec$variables$interview_date$range, "Date")
   expect_equal(spec$model_hint, "native-postprocess")
+  expect_equal(spec$variables$interview_date$model_hint, "native-postprocess")
+  expect_equal(spec$provenance$adapter, "direct")
   expect_equal(spec$provenance$source, "mock_date")
+  expect_equal(spec$variables$interview_date$provenance$adapter, "direct")
+  expect_equal(spec$variables$interview_date$provenance$source, "mock_date")
   expect_true(validate_mock_spec(spec)$valid)
 })
 
+test_that("direct mock APIs are equivalent to explicit mock_spec wrappers", {
+  continuous_provenance <- list(adapter = "direct", source = "mock_continuous")
+  expect_equal(
+    mock_continuous("age", range = c(18, 85), rtype = "integer"),
+    mock_spec(
+      mock_spec_continuous(
+        "age",
+        range = c(18, 85),
+        rtype = "integer",
+        provenance = continuous_provenance
+      ),
+      provenance = continuous_provenance
+    )
+  )
+
+  categorical_provenance <- list(adapter = "direct", source = "mock_categorical")
+  expect_equal(
+    mock_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2)
+    ),
+    mock_spec(
+      mock_spec_categorical(
+        "smoking",
+        levels = c("never", "former", "current"),
+        proportions = c(0.5, 0.3, 0.2),
+        provenance = categorical_provenance
+      ),
+      provenance = categorical_provenance
+    )
+  )
+
+  date_provenance <- list(adapter = "direct", source = "mock_date")
+  expect_equal(
+    mock_date(
+      "interview_date",
+      range = as.Date(c("2001-01-01", "2005-12-31"))
+    ),
+    mock_spec(
+      mock_spec_date(
+        "interview_date",
+        range = as.Date(c("2001-01-01", "2005-12-31")),
+        provenance = date_provenance
+      ),
+      provenance = date_provenance,
+      model_hint = "native-postprocess"
+    )
+  )
+})
+
+test_that("direct mock APIs keep adapter provenance fixed as direct", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    provenance = list(adapter = "not-direct", source = "custom-note")
+  )
+
+  expect_equal(spec$provenance$adapter, "direct")
+  expect_equal(spec$provenance$source, "custom-note")
+  expect_equal(spec$variables$age$provenance$adapter, "direct")
+  expect_equal(spec$variables$age$provenance$source, "custom-note")
+})
+
 test_that("direct mock APIs validate immediately", {
   expect_error(
     mock_continuous(
diff --git a/tests/testthat/test-mock-spec.R b/tests/testthat/test-mock-spec.R
index edc3f41..7d748ef 100644
--- a/tests/testthat/test-mock-spec.R
+++ b/tests/testthat/test-mock-spec.R
@@ -188,6 +188,59 @@ test_that("mock_spec preserves spec_version and rejects missing spec_version", {
 
   expect_false(result$valid)
   expect_true(any(grepl("spec_version", result$errors)))
+
+  spec$spec_version <- ""
+  result <- validate_mock_spec(spec, strict = FALSE)
+
+  expect_false(result$valid)
+  expect_true(any(grepl("spec_version", result$errors)))
+})
+
+test_that("validate_mock_spec checks provenance and model_hint after mutation", {
+  spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85)))
+
+  spec$provenance <- "not-a-list"
+  result <- validate_mock_spec(spec, strict = FALSE)
+  expect_false(result$valid)
+  expect_true(any(grepl("provenance must be a list", result$errors)))
+
+  spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85)))
+  spec$provenance$adapter <- ""
+  result <- validate_mock_spec(spec, strict = FALSE)
+  expect_false(result$valid)
+  expect_true(any(grepl("provenance\\$adapter", result$errors)))
+
+  spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85)))
+  spec$provenance$source <- NA_character_
+  result <- validate_mock_spec(spec, strict = FALSE)
+  expect_false(result$valid)
+  expect_true(any(grepl("provenance\\$source", result$errors)))
+
+  spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85)))
+  spec$model_hint <- "magic"
+  result <- validate_mock_spec(spec, strict = FALSE)
+  expect_false(result$valid)
+  expect_true(any(grepl("mock_spec model_hint", result$errors)))
+
+  spec <- mock_spec(mock_spec_continuous("age", range = c(18, 85)))
+  spec$variables$age$provenance$adapter <- ""
+  spec$variables$age$model_hint <- "magic"
+  result <- validate_mock_spec(spec, strict = FALSE)
+  expect_false(result$valid)
+  expect_true(any(grepl("Variable 'age' provenance\\$adapter", result$errors)))
+  expect_true(any(grepl("Variable 'age' model_hint", result$errors)))
+})
+
+test_that("proportion sums use floating-point tolerance consistently", {
+  spec <- mock_spec(mock_spec_categorical(
+    "smoking",
+    levels = c("never", "former"),
+    proportions = c(0.5, 0.5 + 1e-15),
+    missing_codes = c("7", "9"),
+    missing_proportions = c(0.5, 0.5 + 1e-15)
+  ))
+
+  expect_true(validate_mock_spec(spec)$valid)
 })
 
 test_that("print.mock_spec_validation_result summarizes errors", {

From bd7b389272b40fe5f6890ef85d13f833a2ec5684 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 09:32:13 -0400
Subject: [PATCH 05/18] Add recodeflow mock_spec adapter

---
 NAMESPACE                                  |   1 +
 NEWS.md                                    |   2 +
 R/mock_spec_recodeflow.R                   | 416 +++++++++++++++++++++
 tests/testthat/test-recodeflow-mock-spec.R | 112 ++++++
 4 files changed, 531 insertions(+)
 create mode 100644 R/mock_spec_recodeflow.R
 create mode 100644 tests/testthat/test-recodeflow-mock-spec.R

diff --git a/NAMESPACE b/NAMESPACE
index cb0f2e3..b1b6f71 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -33,6 +33,7 @@ export(mock_spec)
 export(mock_spec_categorical)
 export(mock_spec_continuous)
 export(mock_spec_date)
+export(mock_spec_from_recodeflow)
 export(parse_range_notation)
 export(parse_variable_start)
 export(read_mock_data_config)
diff --git a/NEWS.md b/NEWS.md
index 80afcd5..9b9d9c5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,8 @@
 - Added direct specification helpers `mock_continuous()`,
   `mock_categorical()`, and `mock_date()` for simple use without
   recodeflow-style metadata tables.
+- Added `mock_spec_from_recodeflow()` to adapt recodeflow-style `variables`
+  and `variable_details` metadata into validated `mock_spec` objects.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec_recodeflow.R b/R/mock_spec_recodeflow.R
new file mode 100644
index 0000000..0274ce5
--- /dev/null
+++ b/R/mock_spec_recodeflow.R
@@ -0,0 +1,416 @@
+# ==============================================================================
+# MockData v0.4 Recodeflow Adapter
+# ==============================================================================
+# Converts recodeflow-style variables and variable_details metadata into the
+# normalized mock_spec representation.
+# ==============================================================================
+
+.read_recodeflow_table <- function(x, label) {
+  if (is.data.frame(x)) {
+    return(x)
+  }
+
+  if (is.character(x) && length(x) == 1) {
+    if (!file.exists(x)) {
+      stop(label, " file does not exist: ", x, call. = FALSE)
+    }
+    return(read.csv(x, stringsAsFactors = FALSE, check.names = FALSE))
+  }
+
+  stop(label, " must be a data frame or a single CSV path.", call. = FALSE)
+}
+
+.is_blank <- function(x) {
+  is.null(x) || length(x) == 0 || is.na(x[1]) || trimws(as.character(x[1])) == ""
+}
+
+.row_value <- function(row, name, default = NA) {
+  if (!name %in% names(row)) {
+    return(default)
+  }
+
+  value <- row[[name]][1]
+  if (length(value) == 0) {
+    return(default)
+  }
+
+  value
+}
+
+.row_character <- function(row, name, default = NA_character_) {
+  value <- .row_value(row, name, default)
+  if (.is_blank(value)) {
+    return(default)
+  }
+  as.character(value)
+}
+
+.row_numeric <- function(row, name, default = NA_real_) {
+  value <- .row_value(row, name, default)
+  if (.is_blank(value)) {
+    return(default)
+  }
+  suppressWarnings(as.numeric(value))
+}
+
+.recodeflow_required_columns <- function(data, required, label) {
+  missing <- setdiff(required, names(data))
+  if (length(missing) > 0) {
+    stop(label, " is missing required column(s): ", paste(missing, collapse = ", "), call. = FALSE)
+  }
+}
+
+.filter_recodeflow_by_database <- function(data, databaseStart, allow_empty = TRUE) {
+  if (is.null(databaseStart) || !"databaseStart" %in% names(data)) {
+    return(data)
+  }
+
+  data[.database_start_matches(data$databaseStart, databaseStart, allow_empty = allow_empty), , drop = FALSE]
+}
+
+.filter_recodeflow_details <- function(variable_details, variable, databaseStart) {
+  if (is.null(variable_details)) {
+    return(NULL)
+  }
+
+  details <- variable_details[variable_details$variable == variable, , drop = FALSE]
+  .filter_recodeflow_by_database(details, databaseStart, allow_empty = TRUE)
+}
+
+.recodeflow_variable_kind <- function(var_row) {
+  rtype <- tolower(.row_character(var_row, "rType", ""))
+  variable_type <- tolower(.row_character(var_row, "variableType", ""))
+
+  if (rtype == "date" || variable_type == "date") {
+    return("date")
+  }
+  if (variable_type == "categorical" || rtype %in% c("factor", "character", "logical")) {
+    return("categorical")
+  }
+  if (variable_type == "continuous" || rtype %in% c("integer", "double", "numeric")) {
+    return("continuous")
+  }
+
+  stop(
+    "Variable '", .row_character(var_row, "variable", "<unknown>"),
+    "' has unsupported variableType/rType combination: variableType = '",
+    variable_type, "', rType = '", rtype, "'.",
+    call. = FALSE
+  )
+}
+
+.recodeflow_rtype <- function(var_row, kind) {
+  rtype <- tolower(.row_character(var_row, "rType", ""))
+  if (rtype != "") {
+    if (rtype == "numeric") {
+      return("double")
+    }
+    return(rtype)
+  }
+
+  switch(
+    kind,
+    continuous = "double",
+    categorical = "factor",
+    date = "date"
+  )
+}
+
+.parse_single_date <- function(value) {
+  if (.is_blank(value)) {
+    return(NULL)
+  }
+
+  parsed <- parse_range_notation(paste0("[", value, ",", value, "]"))
+  if (!is.null(parsed) && identical(parsed$type, "date")) {
+    return(c(parsed$min, parsed$max))
+  }
+
+  NULL
+}
+
+.recodeflow_valid_rows <- function(details) {
+  if (is.null(details) || nrow(details) == 0) {
+    return(details)
+  }
+
+  rec_start <- as.character(details$recStart)
+  rec_end <- if ("recEnd" %in% names(details)) as.character(details$recEnd) else rep("", nrow(details))
+
+  keep <- !is.na(rec_start) &
+    rec_start != "" &
+    rec_start != "else" &
+    !grepl("^garbage_", rec_start, ignore.case = TRUE) &
+    !grepl("^NA::", rec_end) &
+    !grepl("^DerivedVar::", rec_start) &
+    !grepl("^Func::", rec_end)
+
+  details[keep, , drop = FALSE]
+}
+
+.recodeflow_range <- function(details, variable, kind) {
+  valid_rows <- .recodeflow_valid_rows(details)
+  if (is.null(valid_rows) || nrow(valid_rows) == 0) {
+    stop("Variable '", variable, "' has no valid recodeflow detail rows for range extraction.", call. = FALSE)
+  }
+
+  for (i in seq_len(nrow(valid_rows))) {
+    rec_start <- valid_rows$recStart[i]
+    parsed <- parse_range_notation(rec_start)
+
+    if (kind == "date") {
+      if (!is.null(parsed) && identical(parsed$type, "date")) {
+        return(c(parsed$min, parsed$max))
+      }
+
+      single_date <- .parse_single_date(rec_start)
+      if (!is.null(single_date)) {
+        return(single_date)
+      }
+    } else {
+      if (!is.null(parsed) && parsed$type %in% c("integer", "continuous", "single_value")) {
+        return(c(parsed$min, parsed$max))
+      }
+    }
+  }
+
+  stop("Variable '", variable, "' has no parseable ", kind, " range in recStart.", call. = FALSE)
+}
+
+.recodeflow_missing <- function(details) {
+  if (is.null(details) || nrow(details) == 0 || !"recEnd" %in% names(details)) {
+    return(list(codes = character(0), proportions = numeric(0)))
+  }
+
+  is_missing <- grepl("^NA::", details$recEnd) &
+    !is.na(details$recStart) &
+    details$recStart != "" &
+    details$recStart != "else"
+
+  missing_rows <- details[is_missing, , drop = FALSE]
+  if (nrow(missing_rows) == 0) {
+    return(list(codes = character(0), proportions = numeric(0)))
+  }
+
+  proportions <- if ("proportion" %in% names(missing_rows)) missing_rows$proportion else rep(NA_real_, nrow(missing_rows))
+  proportions[is.na(proportions)] <- 0
+
+  list(
+    codes = as.character(missing_rows$recStart),
+    proportions = as.numeric(proportions)
+  )
+}
+
+.recodeflow_distribution <- function(var_row, details) {
+  distribution <- tolower(.row_character(var_row, "distribution", ""))
+  if (distribution != "") {
+    return(distribution)
+  }
+
+  params <- tryCatch(
+    extract_distribution_params(details),
+    error = function(e) list(distribution = "uniform")
+  )
+  params$distribution %||% "uniform"
+}
+
+.recodeflow_garbage_rules <- function(var_row) {
+  rules <- list()
+
+  low_prop <- .row_numeric(var_row, "garbage_low_prop")
+  low_range <- .row_character(var_row, "garbage_low_range", "")
+  if (low_range == "[;]") {
+    low_range <- ""
+  }
+  if ((!is.na(low_prop) && low_prop > 0) || low_range != "") {
+    rules$low <- list(proportion = low_prop, range = low_range)
+  }
+
+  high_prop <- .row_numeric(var_row, "garbage_high_prop")
+  high_range <- .row_character(var_row, "garbage_high_range", "")
+  if (high_range == "[;]") {
+    high_range <- ""
+  }
+  if ((!is.na(high_prop) && high_prop > 0) || high_range != "") {
+    rules$high <- list(proportion = high_prop, range = high_range)
+  }
+
+  rules
+}
+
+.recodeflow_provenance <- function(variable, databaseStart = NULL) {
+  provenance <- list(adapter = "recodeflow", source = variable)
+  if (!is.null(databaseStart)) {
+    provenance$databaseStart <- paste(databaseStart, collapse = ",")
+  }
+  provenance
+}
+
+.recodeflow_to_spec_variable <- function(var_row, details, databaseStart) {
+  variable <- .row_character(var_row, "variable")
+  kind <- .recodeflow_variable_kind(var_row)
+  rtype <- .recodeflow_rtype(var_row, kind)
+  provenance <- .recodeflow_provenance(variable, databaseStart)
+  missing <- .recodeflow_missing(details)
+  garbage_rules <- .recodeflow_garbage_rules(var_row)
+
+  if (kind == "categorical") {
+    proportions <- extract_proportions(details, variable_name = variable)
+    if (length(proportions$categories) == 0) {
+      stop("Variable '", variable, "' has no valid categorical levels.", call. = FALSE)
+    }
+
+    return(mock_spec_categorical(
+      name = variable,
+      levels = proportions$categories,
+      proportions = proportions$category_proportions,
+      rtype = rtype,
+      missing_codes = names(proportions$missing),
+      missing_proportions = as.numeric(unlist(proportions$missing, use.names = FALSE)),
+      garbage_rules = garbage_rules,
+      provenance = provenance,
+      model_hint = "native"
+    ))
+  }
+
+  if (kind == "continuous") {
+    distribution <- .recodeflow_distribution(var_row, details)
+
+    return(mock_spec_continuous(
+      name = variable,
+      range = .recodeflow_range(details, variable, "continuous"),
+      distribution = distribution,
+      mean = .row_numeric(var_row, "mean"),
+      sd = .row_numeric(var_row, "sd"),
+      rtype = rtype,
+      missing_codes = missing$codes,
+      missing_proportions = missing$proportions,
+      garbage_rules = garbage_rules,
+      provenance = provenance,
+      model_hint = "native"
+    ))
+  }
+
+  source_format <- .row_character(var_row, "sourceFormat", "analysis")
+
+  .new_mock_spec_variable(
+    name = variable,
+    type = "date",
+    rtype = rtype,
+    distribution = .recodeflow_distribution(var_row, details),
+    range = .recodeflow_range(details, variable, "date"),
+    source_format = source_format,
+    missing_codes = missing$codes,
+    missing_proportions = missing$proportions,
+    garbage_rules = garbage_rules,
+    provenance = provenance,
+    model_hint = "native-postprocess",
+    rate = .row_numeric(var_row, "rate"),
+    shape = .row_numeric(var_row, "shape"),
+    followup_min = .row_numeric(var_row, "followup_min"),
+    followup_max = .row_numeric(var_row, "followup_max"),
+    event_prop = .row_numeric(var_row, "event_prop")
+  )
+}
+
+#' Convert recodeflow metadata to a MockData specification
+#'
+#' `mock_spec_from_recodeflow()` adapts recodeflow-style `variables` and
+#' `variable_details` metadata into the normalized v0.4 `mock_spec` shape. It
+#' returns a validated specification; it does not generate data.
+#'
+#' @param variables Data frame or CSV path for recodeflow-style `variables`
+#'   metadata.
+#' @param variable_details Data frame, CSV path, or `NULL` for recodeflow-style
+#'   `variable_details` metadata.
+#' @param databaseStart Optional database/cycle token used to filter metadata by
+#'   exact comma-separated `databaseStart` values.
+#' @param role Character vector of role tokens to include. Defaults to
+#'   `"enabled"`. Use `NULL` to skip role filtering.
+#' @param exclude_derived Logical. If `TRUE`, exclude variables identified by
+#'   `DerivedVar::` or `Func::` rows in `variable_details`.
+#' @param spec_version Character version of the specification shape.
+#' @param model_hint Backend hint for the returned specification.
+#'
+#' @return A validated `mock_spec` object.
+#' @family mock specification APIs
+#' @seealso [mock_spec()], [mock_continuous()], [mock_categorical()],
+#'   [mock_date()]
+#'
+#' @examples
+#' variables <- data.frame(
+#'   variable = "age",
+#'   variableType = "Continuous",
+#'   rType = "integer",
+#'   role = "enabled",
+#'   distribution = "uniform"
+#' )
+#' details <- data.frame(
+#'   variable = "age",
+#'   recStart = "[18, 85]",
+#'   recEnd = "copy",
+#'   proportion = 1
+#' )
+#' spec <- mock_spec_from_recodeflow(variables, details)
+#' validate_mock_spec(spec)
+#'
+#' @export
+mock_spec_from_recodeflow <- function(variables,
+                                      variable_details = NULL,
+                                      databaseStart = NULL,
+                                      role = "enabled",
+                                      exclude_derived = TRUE,
+                                      spec_version = .mock_spec_version,
+                                      model_hint = "auto") {
+  variables <- .read_recodeflow_table(variables, "variables")
+  variables <- .migrate_garbage_aliases(variables)
+  .recodeflow_required_columns(variables, "variable", "variables")
+
+  if (!is.null(variable_details)) {
+    variable_details <- .read_recodeflow_table(variable_details, "variable_details")
+    .recodeflow_required_columns(variable_details, c("variable", "recStart"), "variable_details")
+  }
+
+  if (!is.null(role)) {
+    if (!"role" %in% names(variables)) {
+      stop("variables must have a 'role' column when role filtering is requested.", call. = FALSE)
+    }
+    variables <- variables[.role_matches(variables$role, role, ignore.case = TRUE), , drop = FALSE]
+  }
+
+  variables <- .filter_recodeflow_by_database(variables, databaseStart, allow_empty = TRUE)
+
+  if (nrow(variables) == 0) {
+    stop("No variables matched the requested role/database filters.", call. = FALSE)
+  }
+
+  if (isTRUE(exclude_derived) && !is.null(variable_details)) {
+    derived <- identify_derived_vars(variables, variable_details)
+    variables <- variables[!variables$variable %in% derived, , drop = FALSE]
+  }
+
+  if (nrow(variables) == 0) {
+    stop("No non-derived variables remain after filtering.", call. = FALSE)
+  }
+
+  spec_variables <- lapply(seq_len(nrow(variables)), function(i) {
+    var_row <- variables[i, , drop = FALSE]
+    details <- .filter_recodeflow_details(variable_details, var_row$variable[1], databaseStart)
+    .recodeflow_to_spec_variable(var_row, details, databaseStart)
+  })
+
+  provenance <- list(adapter = "recodeflow", source = "variables+variable_details")
+  if (!is.null(databaseStart)) {
+    provenance$databaseStart <- paste(databaseStart, collapse = ",")
+  }
+  if (!is.null(role)) {
+    provenance$role <- paste(role, collapse = ",")
+  }
+
+  mock_spec(
+    spec_variables,
+    spec_version = spec_version,
+    provenance = provenance,
+    model_hint = model_hint
+  )
+}
diff --git a/tests/testthat/test-recodeflow-mock-spec.R b/tests/testthat/test-recodeflow-mock-spec.R
new file mode 100644
index 0000000..97db9e7
--- /dev/null
+++ b/tests/testthat/test-recodeflow-mock-spec.R
@@ -0,0 +1,112 @@
+minimal_example_path <- function(...) {
+  file.path("..", "..", "inst", "extdata", "minimal-example", ...)
+}
+
+test_that("mock_spec_from_recodeflow converts minimal metadata", {
+  variables <- read.csv(
+    minimal_example_path("variables.csv"),
+    stringsAsFactors = FALSE,
+    check.names = FALSE
+  )
+  variable_details <- read.csv(
+    minimal_example_path("variable_details.csv"),
+    stringsAsFactors = FALSE,
+    check.names = FALSE
+  )
+
+  spec <- mock_spec_from_recodeflow(variables, variable_details)
+
+  expect_s3_class(spec, "mock_spec")
+  expect_equal(spec$provenance$adapter, "recodeflow")
+  expect_true(validate_mock_spec(spec)$valid)
+  expect_false("BMI_derived" %in% names(spec$variables))
+  expect_true(all(c("age", "smoking", "interview_date") %in% names(spec$variables)))
+
+  expect_equal(spec$variables$age$type, "continuous")
+  expect_equal(spec$variables$age$rtype, "integer")
+  expect_equal(spec$variables$age$distribution, "normal")
+  expect_equal(spec$variables$age$range, c(18, 100))
+  expect_equal(spec$variables$age$missing_codes, c("997", "998", "999"))
+  expect_length(spec$variables$age$garbage_rules, 0)
+
+  expect_equal(spec$variables$smoking$type, "categorical")
+  expect_equal(spec$variables$smoking$levels, c("1", "2", "3"))
+  expect_equal(sum(spec$variables$smoking$proportions), 1)
+
+  expect_equal(spec$variables$interview_date$type, "date")
+  expect_s3_class(spec$variables$interview_date$range, "Date")
+  expect_equal(spec$variables$interview_date$source_format, "analysis")
+})
+
+test_that("mock_spec_from_recodeflow filters exact role and databaseStart tokens", {
+  variables <- data.frame(
+    variable = c("age", "disabled_age", "cycle10_age"),
+    variableType = "Continuous",
+    rType = "integer",
+    role = c("enabled", "disabled", "enabled"),
+    databaseStart = c("cycle1, cycle2", "cycle1", "cycle10"),
+    distribution = "uniform",
+    stringsAsFactors = FALSE
+  )
+  details <- data.frame(
+    variable = c("age", "disabled_age", "cycle10_age"),
+    recStart = c("[18, 85]", "[18, 85]", "[18, 85]"),
+    recEnd = "copy",
+    databaseStart = c("cycle1", "cycle1", "cycle10"),
+    proportion = 1,
+    stringsAsFactors = FALSE
+  )
+
+  spec <- mock_spec_from_recodeflow(
+    variables,
+    details,
+    databaseStart = "cycle1",
+    role = "enabled"
+  )
+
+  expect_named(spec$variables, "age")
+})
+
+test_that("mock_spec_from_recodeflow preserves garbage and survival fields", {
+  variables <- read.csv(
+    minimal_example_path("variables.csv"),
+    stringsAsFactors = FALSE,
+    check.names = FALSE
+  )
+  variable_details <- read.csv(
+    minimal_example_path("variable_details.csv"),
+    stringsAsFactors = FALSE,
+    check.names = FALSE
+  )
+
+  spec <- mock_spec_from_recodeflow(variables, variable_details)
+
+  expect_equal(spec$variables$BMI$garbage_rules$low$proportion, 0.02)
+  expect_equal(spec$variables$BMI$garbage_rules$low$range, "[-10;15])")
+  expect_equal(spec$variables$BMI$garbage_rules$high$proportion, 0.01)
+
+  expect_equal(spec$variables$primary_event_date$distribution, "gompertz")
+  expect_equal(spec$variables$primary_event_date$event_prop, 0.3)
+  expect_equal(spec$variables$primary_event_date$followup_max, 5475)
+})
+
+test_that("mock_spec_from_recodeflow validates adapter inputs", {
+  variables <- data.frame(
+    variable = "age",
+    variableType = "Continuous",
+    rType = "integer",
+    role = "enabled",
+    stringsAsFactors = FALSE
+  )
+
+  expect_error(
+    mock_spec_from_recodeflow(variables, variable_details = NULL),
+    "no valid recodeflow detail rows"
+  )
+
+  variables$role <- "disabled"
+  expect_error(
+    mock_spec_from_recodeflow(variables, data.frame(variable = "age", recStart = "[18, 85]")),
+    "No variables matched"
+  )
+})

From b1895f5e0a9a8d107ffdf192cc38c098087933db Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 11:10:54 -0400
Subject: [PATCH 06/18] Harden recodeflow mock_spec adapter

---
 NEWS.md                                    |   5 +-
 R/mock_spec_recodeflow.R                   |  62 +++++++++++-
 tests/testthat/test-recodeflow-mock-spec.R | 104 ++++++++++++++++++++-
 3 files changed, 164 insertions(+), 7 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 9b9d9c5..b6558a3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,7 +10,10 @@
   `mock_categorical()`, and `mock_date()` for simple use without
   recodeflow-style metadata tables.
 - Added `mock_spec_from_recodeflow()` to adapt recodeflow-style `variables`
-  and `variable_details` metadata into validated `mock_spec` objects.
+  and `variable_details` metadata into validated `mock_spec` objects while
+  preserving role/database filtering, categorical proportions, `recEnd`
+  missing-code semantics, valid ranges, garbage rules, date ranges, and
+  survival/date fields.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec_recodeflow.R b/R/mock_spec_recodeflow.R
index 0274ce5..f713a15 100644
--- a/R/mock_spec_recodeflow.R
+++ b/R/mock_spec_recodeflow.R
@@ -14,7 +14,12 @@
     if (!file.exists(x)) {
       stop(label, " file does not exist: ", x, call. = FALSE)
     }
-    return(read.csv(x, stringsAsFactors = FALSE, check.names = FALSE))
+    return(read.csv(
+      x,
+      stringsAsFactors = FALSE,
+      check.names = FALSE,
+      na.strings = c("", "NA")
+    ))
   }
 
   stop(label, " must be a data frame or a single CSV path.", call. = FALSE)
@@ -50,7 +55,18 @@
   if (.is_blank(value)) {
     return(default)
   }
-  suppressWarnings(as.numeric(value))
+
+  numeric_value <- suppressWarnings(as.numeric(value))
+  if (is.na(numeric_value)) {
+    stop(
+      "Column '", name, "' for variable '",
+      .row_character(row, "variable", "<unknown>"),
+      "' must be numeric; got '", as.character(value), "'.",
+      call. = FALSE
+    )
+  }
+
+  numeric_value
 }
 
 .recodeflow_required_columns <- function(data, required, label) {
@@ -61,9 +77,15 @@
 }
 
 .filter_recodeflow_by_database <- function(data, databaseStart, allow_empty = TRUE) {
-  if (is.null(databaseStart) || !"databaseStart" %in% names(data)) {
+  if (is.null(databaseStart)) {
     return(data)
   }
+  if (!"databaseStart" %in% names(data)) {
+    stop(
+      "databaseStart filtering was requested, but metadata has no 'databaseStart' column.",
+      call. = FALSE
+    )
+  }
 
   data[.database_start_matches(data$databaseStart, databaseStart, allow_empty = allow_empty), , drop = FALSE]
 }
@@ -143,7 +165,7 @@
     !grepl("^garbage_", rec_start, ignore.case = TRUE) &
     !grepl("^NA::", rec_end) &
     !grepl("^DerivedVar::", rec_start) &
-    !grepl("^Func::", rec_end)
+    !grepl("^Func::", rec_start)
 
   details[keep, , drop = FALSE]
 }
@@ -209,7 +231,16 @@
 
   params <- tryCatch(
     extract_distribution_params(details),
-    error = function(e) list(distribution = "uniform")
+    error = function(e) {
+      warning(
+        "Could not infer distribution for variable '",
+        .row_character(var_row, "variable", "<unknown>"),
+        "' from details; using uniform. Reason: ",
+        conditionMessage(e),
+        call. = FALSE
+      )
+      list(distribution = "uniform")
+    }
   )
   params$distribution %||% "uniform"
 }
@@ -319,6 +350,20 @@
 #' `variable_details` metadata into the normalized v0.4 `mock_spec` shape. It
 #' returns a validated specification; it does not generate data.
 #'
+#' @details
+#' This adapter preserves recodeflow semantics instead of treating metadata as a
+#' generic table. It uses exact role and `databaseStart` token matching, parses
+#' valid ranges from `recStart`, classifies missing codes from `recEnd` values
+#' that begin with `NA::`, preserves categorical levels and proportions, carries
+#' `garbage_*` settings into `garbage_rules`, and stores survival/date fields
+#' such as `rate`, `shape`, `followup_min`, `followup_max`, and `event_prop` on
+#' date variables for later backend milestones.
+#'
+#' By default, variables identified by `DerivedVar::` or `Func::` rows are
+#' excluded because they should be evaluated after raw mock variables are
+#' generated. Set `exclude_derived = FALSE` only when you want those rows to
+#' appear in the adapter input and fail or be handled by later formula support.
+#'
 #' @param variables Data frame or CSV path for recodeflow-style `variables`
 #'   metadata.
 #' @param variable_details Data frame, CSV path, or `NULL` for recodeflow-style
@@ -386,6 +431,13 @@ mock_spec_from_recodeflow <- function(variables,
 
   if (isTRUE(exclude_derived) && !is.null(variable_details)) {
     derived <- identify_derived_vars(variables, variable_details)
+    removed <- intersect(variables$variable, derived)
+    if (length(removed) > 0) {
+      message(
+        "Excluding derived recodeflow variable(s): ",
+        paste(removed, collapse = ", ")
+      )
+    }
     variables <- variables[!variables$variable %in% derived, , drop = FALSE]
   }
 
diff --git a/tests/testthat/test-recodeflow-mock-spec.R b/tests/testthat/test-recodeflow-mock-spec.R
index 97db9e7..51fdcc4 100644
--- a/tests/testthat/test-recodeflow-mock-spec.R
+++ b/tests/testthat/test-recodeflow-mock-spec.R
@@ -31,7 +31,7 @@ test_that("mock_spec_from_recodeflow converts minimal metadata", {
 
   expect_equal(spec$variables$smoking$type, "categorical")
   expect_equal(spec$variables$smoking$levels, c("1", "2", "3"))
-  expect_equal(sum(spec$variables$smoking$proportions), 1)
+  expect_equal(spec$variables$smoking$proportions, c(0.5, 0.3, 0.17) / 0.97)
 
   expect_equal(spec$variables$interview_date$type, "date")
   expect_s3_class(spec$variables$interview_date$range, "Date")
@@ -84,6 +84,7 @@ test_that("mock_spec_from_recodeflow preserves garbage and survival fields", {
   expect_equal(spec$variables$BMI$garbage_rules$low$proportion, 0.02)
   expect_equal(spec$variables$BMI$garbage_rules$low$range, "[-10;15])")
   expect_equal(spec$variables$BMI$garbage_rules$high$proportion, 0.01)
+  expect_equal(spec$variables$BMI$garbage_rules$high$range, "[60;150]")
 
   expect_equal(spec$variables$primary_event_date$distribution, "gompertz")
   expect_equal(spec$variables$primary_event_date$event_prop, 0.3)
@@ -110,3 +111,104 @@ test_that("mock_spec_from_recodeflow validates adapter inputs", {
     "No variables matched"
   )
 })
+
+test_that("mock_spec_from_recodeflow fails loudly on missing databaseStart column", {
+  variables <- data.frame(
+    variable = "age",
+    variableType = "Continuous",
+    rType = "integer",
+    role = "enabled",
+    stringsAsFactors = FALSE
+  )
+  details <- data.frame(
+    variable = "age",
+    recStart = "[18, 85]",
+    recEnd = "copy",
+    proportion = 1,
+    stringsAsFactors = FALSE
+  )
+
+  expect_error(
+    mock_spec_from_recodeflow(variables, details, databaseStart = "cycle1"),
+    "no 'databaseStart' column"
+  )
+})
+
+test_that("mock_spec_from_recodeflow rejects non-numeric scalar fields", {
+  variables <- data.frame(
+    variable = "age",
+    variableType = "Continuous",
+    rType = "integer",
+    role = "enabled",
+    distribution = "normal",
+    mean = "middle",
+    sd = 10,
+    stringsAsFactors = FALSE
+  )
+  details <- data.frame(
+    variable = "age",
+    recStart = "[18, 85]",
+    recEnd = "copy",
+    proportion = 1,
+    stringsAsFactors = FALSE
+  )
+
+  expect_error(
+    mock_spec_from_recodeflow(variables, details),
+    "must be numeric"
+  )
+})
+
+test_that("mock_spec_from_recodeflow excludes Func rows from valid ranges", {
+  variables <- data.frame(
+    variable = "age",
+    variableType = "Continuous",
+    rType = "integer",
+    role = "enabled",
+    distribution = "uniform",
+    stringsAsFactors = FALSE
+  )
+  details <- data.frame(
+    variable = c("age", "age"),
+    recStart = c("Func::age_cleanup", "[18, 85]"),
+    recEnd = c("copy", "copy"),
+    proportion = c(NA, 1),
+    stringsAsFactors = FALSE
+  )
+
+  spec <- mock_spec_from_recodeflow(variables, details)
+
+  expect_equal(spec$variables$age$range, c(18, 85))
+})
+
+test_that("mock_spec_from_recodeflow matches direct adapter specs modulo provenance", {
+  variables <- data.frame(
+    variable = "age",
+    variableType = "Continuous",
+    rType = "integer",
+    role = "enabled",
+    distribution = "uniform",
+    stringsAsFactors = FALSE
+  )
+  details <- data.frame(
+    variable = "age",
+    recStart = "[18, 85]",
+    recEnd = "copy",
+    proportion = 1,
+    stringsAsFactors = FALSE
+  )
+
+  recodeflow_spec <- mock_spec_from_recodeflow(variables, details)
+  direct_spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    rtype = "integer",
+    missing_codes = character(0)
+  )
+
+  recodeflow_spec$provenance <- direct_spec$provenance
+  recodeflow_spec$variables$age$provenance <- direct_spec$variables$age$provenance
+  recodeflow_spec$variables$age$model_hint <- direct_spec$variables$age$model_hint
+
+  expect_equal(recodeflow_spec, direct_spec)
+})

From a6aa4cf4ab3f00aca6a3a6ef4857b1512c8bee00 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 11:24:19 -0400
Subject: [PATCH 07/18] Add native mock_spec backend

---
 NAMESPACE                            |   1 +
 NEWS.md                              |   2 +
 R/mock_spec_native.R                 | 264 +++++++++++++++++++++++++++
 tests/testthat/test-native-backend.R | 123 +++++++++++++
 4 files changed, 390 insertions(+)
 create mode 100644 R/mock_spec_native.R
 create mode 100644 tests/testthat/test-native-backend.R

diff --git a/NAMESPACE b/NAMESPACE
index b1b6f71..76b5988 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -15,6 +15,7 @@ export(create_wide_survival_data)
 export(extract_distribution_params)
 export(extract_proportions)
 export(generate_garbage_values)
+export(generate_mock_data_native)
 export(get_cycle_variables)
 export(get_enabled_variables)
 export(get_raw_var_dependencies)
diff --git a/NEWS.md b/NEWS.md
index b6558a3..5e14789 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -14,6 +14,8 @@
   preserving role/database filtering, categorical proportions, `recEnd`
   missing-code semantics, valid ranges, garbage rules, date ranges, and
   survival/date fields.
+- Added `generate_mock_data_native()` to generate baseline valid mock data from
+  `mock_spec` objects with the native R backend.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec_native.R b/R/mock_spec_native.R
new file mode 100644
index 0000000..1e0d812
--- /dev/null
+++ b/R/mock_spec_native.R
@@ -0,0 +1,264 @@
+# ==============================================================================
+# MockData v0.4 Native Backend
+# ==============================================================================
+# Baseline native generation from mock_spec. Post-processing for missing codes,
+# garbage, diagnostics, and richer rType handling lands in later milestones.
+# ==============================================================================
+
+.with_mock_seed <- function(seed, expr) {
+  if (is.null(seed)) {
+    return(force(expr))
+  }
+
+  if (!is.numeric(seed) || length(seed) != 1 || is.na(seed) || seed != floor(seed)) {
+    stop("seed must be a single whole number.", call. = FALSE)
+  }
+
+  had_seed <- exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
+  if (had_seed) {
+    old_seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
+  }
+
+  on.exit({
+    if (had_seed) {
+      assign(".Random.seed", old_seed, envir = .GlobalEnv)
+    } else if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
+      rm(".Random.seed", envir = .GlobalEnv)
+    }
+  }, add = TRUE)
+
+  set.seed(seed)
+  force(expr)
+}
+
+.empty_native_data <- function(n) {
+  data.frame(row.names = seq_len(n))
+}
+
+.sample_indices <- function(n_levels, n, prob = NULL) {
+  if (n == 0) {
+    return(integer(0))
+  }
+  sample.int(n_levels, size = n, replace = TRUE, prob = prob)
+}
+
+.native_truncated_normal <- function(n, mean, sd, range) {
+  if (n == 0) {
+    return(numeric(0))
+  }
+
+  values <- rep(NA_real_, n)
+  remaining <- seq_len(n)
+  attempts <- 0
+  max_attempts <- 100
+
+  while (length(remaining) > 0 && attempts < max_attempts) {
+    draws <- stats::rnorm(length(remaining), mean = mean, sd = sd)
+    valid <- draws >= range[[1]] & draws <= range[[2]]
+    values[remaining[valid]] <- draws[valid]
+    remaining <- remaining[!valid]
+    attempts <- attempts + 1
+  }
+
+  if (length(remaining) > 0) {
+    warning(
+      "Could not fill all truncated-normal values by rejection sampling; ",
+      "using uniform draws for the remaining values.",
+      call. = FALSE
+    )
+    values[remaining] <- stats::runif(length(remaining), range[[1]], range[[2]])
+  }
+
+  values
+}
+
+.coerce_native_continuous <- function(values, rtype, variable_name) {
+  if (rtype == "integer") {
+    return(as.integer(round(values)))
+  }
+  if (rtype %in% c("double", "numeric")) {
+    return(as.numeric(values))
+  }
+
+  stop(
+    "Variable '", variable_name, "' has unsupported native continuous rType '",
+    rtype, "'.",
+    call. = FALSE
+  )
+}
+
+.coerce_native_categorical <- function(values, levels, rtype, variable_name) {
+  if (rtype == "factor") {
+    return(factor(values, levels = levels))
+  }
+  if (rtype == "character") {
+    return(as.character(values))
+  }
+  if (rtype == "integer") {
+    converted <- suppressWarnings(as.integer(values))
+    if (any(is.na(converted) & !is.na(values))) {
+      stop(
+        "Variable '", variable_name,
+        "' integer categorical generation requires integer-like levels.",
+        call. = FALSE
+      )
+    }
+    return(converted)
+  }
+  if (rtype %in% c("double", "numeric")) {
+    converted <- suppressWarnings(as.numeric(values))
+    if (any(is.na(converted) & !is.na(values))) {
+      stop(
+        "Variable '", variable_name,
+        "' numeric categorical generation requires numeric-like levels.",
+        call. = FALSE
+      )
+    }
+    return(converted)
+  }
+  if (rtype == "logical") {
+    if (!all(values %in% c("TRUE", "FALSE", "true", "false", "1", "0", TRUE, FALSE))) {
+      stop(
+        "Variable '", variable_name,
+        "' logical categorical generation requires TRUE/FALSE or 1/0 levels.",
+        call. = FALSE
+      )
+    }
+    return(values %in% c("TRUE", "true", "1", TRUE))
+  }
+
+  stop(
+    "Variable '", variable_name, "' has unsupported native categorical rType '",
+    rtype, "'.",
+    call. = FALSE
+  )
+}
+
+.coerce_native_date <- function(values, rtype, variable_name) {
+  if (rtype == "date") {
+    return(values)
+  }
+  if (rtype == "character") {
+    return(as.character(values))
+  }
+
+  stop(
+    "Variable '", variable_name, "' has unsupported native date rType '",
+    rtype, "'.",
+    call. = FALSE
+  )
+}
+
+.generate_native_continuous <- function(variable, n) {
+  distribution <- tolower(variable$distribution %||% "uniform")
+
+  if (distribution == "uniform") {
+    values <- stats::runif(n, variable$range[[1]], variable$range[[2]])
+  } else if (distribution == "normal") {
+    values <- .native_truncated_normal(n, variable$mean, variable$sd, variable$range)
+  } else {
+    stop(
+      "Native backend does not yet support continuous distribution '",
+      distribution, "' for variable '", variable$name, "'.",
+      call. = FALSE
+    )
+  }
+
+  .coerce_native_continuous(values, variable$rtype, variable$name)
+}
+
+.generate_native_categorical <- function(variable, n) {
+  levels <- as.character(variable$levels)
+  prob <- variable$proportions
+  if (is.null(prob)) {
+    prob <- rep(1 / length(levels), length(levels))
+  }
+
+  values <- levels[.sample_indices(length(levels), n, prob)]
+  .coerce_native_categorical(values, levels, variable$rtype, variable$name)
+}
+
+.generate_native_date <- function(variable, n) {
+  distribution <- tolower(variable$distribution %||% "uniform")
+  if (distribution != "uniform") {
+    stop(
+      "Native backend does not yet support date distribution '",
+      distribution, "' for variable '", variable$name, "'.",
+      call. = FALSE
+    )
+  }
+
+  if (n == 0) {
+    values <- as.Date(character(0))
+  } else {
+    range_numeric <- as.integer(variable$range)
+    offsets <- .sample_indices(
+      range_numeric[[2]] - range_numeric[[1]] + 1,
+      n
+    ) - 1
+    values <- as.Date(range_numeric[[1]] + offsets, origin = "1970-01-01")
+  }
+
+  .coerce_native_date(values, variable$rtype, variable$name)
+}
+
+.generate_native_variable <- function(variable, n) {
+  if (variable$type == "continuous") {
+    return(.generate_native_continuous(variable, n))
+  }
+  if (variable$type == "categorical") {
+    return(.generate_native_categorical(variable, n))
+  }
+  if (variable$type == "date") {
+    return(.generate_native_date(variable, n))
+  }
+
+  stop(
+    "Native backend does not support variable type '", variable$type,
+    "' for variable '", variable$name, "'.",
+    call. = FALSE
+  )
+}
+
+#' Generate mock data with the native backend
+#'
+#' `generate_mock_data_native()` consumes a validated `mock_spec` and generates
+#' baseline valid values using MockData's native R backend. This milestone does
+#' not yet apply missing-code injection, garbage values, diagnostics, formula
+#' evaluation, or optional `simstudy` features.
+#'
+#' @param spec A `mock_spec` object.
+#' @param n Non-negative whole number of rows to generate.
+#' @param seed Optional whole-number random seed. The previous R random state is
+#'   restored after generation.
+#'
+#' @return A data frame with one column per `mock_spec` variable and `n` rows.
+#' @family mock generation APIs
+#' @seealso [mock_spec()], [mock_continuous()], [mock_spec_from_recodeflow()]
+#'
+#' @examples
+#' spec <- mock_spec(
+#'   mock_spec_continuous("age", range = c(18, 85), rtype = "integer"),
+#'   mock_spec_categorical(
+#'     "smoking",
+#'     levels = c("never", "former", "current"),
+#'     proportions = c(0.5, 0.3, 0.2)
+#'   )
+#' )
+#' data <- generate_mock_data_native(spec, n = 10, seed = 1)
+#' head(data)
+#'
+#' @export
+generate_mock_data_native <- function(spec, n, seed = NULL) {
+  validate_mock_spec(spec, n = n, strict = TRUE)
+
+  .with_mock_seed(seed, {
+    if (length(spec$variables) == 0) {
+      .empty_native_data(n)
+    } else {
+      columns <- lapply(spec$variables, .generate_native_variable, n = n)
+      names(columns) <- names(spec$variables)
+      as.data.frame(columns, stringsAsFactors = FALSE, check.names = FALSE)
+    }
+  })
+}
diff --git a/tests/testthat/test-native-backend.R b/tests/testthat/test-native-backend.R
new file mode 100644
index 0000000..7b3c56d
--- /dev/null
+++ b/tests/testthat/test-native-backend.R
@@ -0,0 +1,123 @@
+test_that("generate_mock_data_native generates baseline direct specs", {
+  spec <- mock_spec(
+    mock_spec_continuous(
+      "age",
+      range = c(18, 85),
+      distribution = "normal",
+      mean = 50,
+      sd = 12,
+      rtype = "integer"
+    ),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    ),
+    mock_spec_date(
+      "interview_date",
+      range = as.Date(c("2001-01-01", "2005-12-31"))
+    )
+  )
+
+  result <- generate_mock_data_native(spec, n = 500, seed = 101)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 500)
+  expect_named(result, c("age", "smoking", "interview_date"))
+  expect_true(all(result$age >= 18 & result$age <= 85))
+  expect_type(result$age, "integer")
+  expect_true(all(result$smoking %in% c("never", "former", "current")))
+  expect_s3_class(result$interview_date, "Date")
+  expect_true(all(result$interview_date >= as.Date("2001-01-01")))
+  expect_true(all(result$interview_date <= as.Date("2005-12-31")))
+})
+
+test_that("generate_mock_data_native is reproducible without leaking RNG state", {
+  spec <- mock_continuous("age", range = c(18, 85), rtype = "integer")
+
+  set.seed(999)
+  before <- runif(1)
+  result_1 <- generate_mock_data_native(spec, n = 10, seed = 42)
+  after_1 <- runif(1)
+
+  set.seed(999)
+  expect_equal(runif(1), before)
+  result_2 <- generate_mock_data_native(spec, n = 10, seed = 42)
+  after_2 <- runif(1)
+
+  expect_equal(result_1, result_2)
+  expect_equal(after_1, after_2)
+})
+
+test_that("generate_mock_data_native handles empty specs and n = 0", {
+  empty <- generate_mock_data_native(mock_spec(), n = 5, seed = 1)
+  expect_s3_class(empty, "data.frame")
+  expect_equal(nrow(empty), 5)
+  expect_equal(ncol(empty), 0)
+
+  spec <- mock_categorical("smoking", levels = c("never", "former", "current"))
+  zero <- generate_mock_data_native(spec, n = 0, seed = 1)
+  expect_equal(nrow(zero), 0)
+  expect_named(zero, "smoking")
+})
+
+test_that("generate_mock_data_native consumes simple recodeflow specs", {
+  variables <- data.frame(
+    variable = c("age", "smoking", "interview_date"),
+    variableType = c("Continuous", "Categorical", "Continuous"),
+    rType = c("integer", "character", "date"),
+    role = "enabled",
+    distribution = c("uniform", "", "uniform"),
+    stringsAsFactors = FALSE
+  )
+  details <- data.frame(
+    variable = c("age", "smoking", "smoking", "interview_date"),
+    recStart = c("[18, 85]", "never", "current", "[2001-01-01,2001-01-31]"),
+    recEnd = c("copy", "never", "current", "copy"),
+    proportion = c(1, 0.75, 0.25, 1),
+    stringsAsFactors = FALSE
+  )
+
+  spec <- mock_spec_from_recodeflow(variables, details)
+  result <- generate_mock_data_native(spec, n = 100, seed = 55)
+
+  expect_named(result, c("age", "smoking", "interview_date"))
+  expect_true(all(result$age >= 18 & result$age <= 85))
+  expect_true(all(result$smoking %in% c("never", "current")))
+  expect_s3_class(result$interview_date, "Date")
+})
+
+test_that("generate_mock_data_native fails loudly for unsupported native features", {
+  survival_like <- mock_spec(
+    mock_spec_date(
+      "event_date",
+      range = as.Date(c("2001-01-01", "2005-12-31"))
+    ),
+    validate = FALSE
+  )
+  survival_like$variables$event_date$distribution <- "gompertz"
+
+  expect_error(
+    generate_mock_data_native(survival_like, n = 10),
+    "does not yet support date distribution"
+  )
+
+  expect_error(
+    generate_mock_data_native(list(), n = 10),
+    "mock_spec"
+  )
+})
+
+test_that("generate_mock_data_native rejects lossy categorical coercion", {
+  spec <- mock_categorical(
+    "smoking",
+    levels = c("never", "former", "current"),
+    rtype = "integer"
+  )
+
+  expect_error(
+    generate_mock_data_native(spec, n = 10, seed = 1),
+    "integer-like levels"
+  )
+})

From 501698b6ae70a865af84b29afe8b1206d28dd819 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 11:33:49 -0400
Subject: [PATCH 08/18] Harden native mock_spec backend

---
 R/mock_spec_native.R                 | 37 ++++++++++-
 tests/testthat/test-native-backend.R | 93 ++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 3 deletions(-)

diff --git a/R/mock_spec_native.R b/R/mock_spec_native.R
index 1e0d812..990cbf2 100644
--- a/R/mock_spec_native.R
+++ b/R/mock_spec_native.R
@@ -42,7 +42,30 @@
   sample.int(n_levels, size = n, replace = TRUE, prob = prob)
 }
 
-.native_truncated_normal <- function(n, mean, sd, range) {
+.native_formula_variables <- function(spec) {
+  names(Filter(function(variable) {
+    formula <- variable$formula
+    !is.null(formula) &&
+      !(is.character(formula) && length(formula) == 1 && (is.na(formula) || trimws(formula) == ""))
+  }, spec$variables))
+}
+
+.check_native_backend_scope <- function(spec) {
+  formula_variables <- .native_formula_variables(spec)
+  if (length(formula_variables) > 0) {
+    stop(
+      "Formula evaluation is not yet implemented in the M4 native backend. ",
+      "Formula variable(s): ",
+      paste(formula_variables, collapse = ", "),
+      ". Expected in a later formula/dependency milestone.",
+      call. = FALSE
+    )
+  }
+
+  invisible(TRUE)
+}
+
+.native_truncated_normal <- function(n, mean, sd, range, variable_name) {
   if (n == 0) {
     return(numeric(0))
   }
@@ -62,7 +85,8 @@
 
   if (length(remaining) > 0) {
     warning(
-      "Could not fill all truncated-normal values by rejection sampling; ",
+      "Variable '", variable_name,
+      "': could not fill all truncated-normal values by rejection sampling; ",
       "using uniform draws for the remaining values.",
       call. = FALSE
     )
@@ -155,7 +179,13 @@
   if (distribution == "uniform") {
     values <- stats::runif(n, variable$range[[1]], variable$range[[2]])
   } else if (distribution == "normal") {
-    values <- .native_truncated_normal(n, variable$mean, variable$sd, variable$range)
+    values <- .native_truncated_normal(
+      n,
+      variable$mean,
+      variable$sd,
+      variable$range,
+      variable$name
+    )
   } else {
     stop(
       "Native backend does not yet support continuous distribution '",
@@ -251,6 +281,7 @@
 #' @export
 generate_mock_data_native <- function(spec, n, seed = NULL) {
   validate_mock_spec(spec, n = n, strict = TRUE)
+  .check_native_backend_scope(spec)
 
   .with_mock_seed(seed, {
     if (length(spec$variables) == 0) {
diff --git a/tests/testthat/test-native-backend.R b/tests/testthat/test-native-backend.R
index 7b3c56d..9904878 100644
--- a/tests/testthat/test-native-backend.R
+++ b/tests/testthat/test-native-backend.R
@@ -33,6 +33,39 @@ test_that("generate_mock_data_native generates baseline direct specs", {
   expect_true(all(result$interview_date <= as.Date("2005-12-31")))
 })
 
+test_that("generate_mock_data_native preserves statistical contracts", {
+  spec <- mock_spec(
+    mock_spec_continuous("uniform_age", range = c(20, 80), rtype = "double"),
+    mock_spec_continuous(
+      "normal_age",
+      range = c(18, 85),
+      distribution = "normal",
+      mean = 50,
+      sd = 12,
+      rtype = "double"
+    ),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    )
+  )
+
+  result <- generate_mock_data_native(spec, n = 5000, seed = 202)
+
+  expect_equal(mean(result$uniform_age), 50, tolerance = 1)
+  expect_equal(stats::sd(result$uniform_age), 60 / sqrt(12), tolerance = 1)
+  expect_equal(mean(result$normal_age), 50, tolerance = 1)
+  expect_equal(stats::sd(result$normal_age), 12, tolerance = 1)
+
+  observed <- prop.table(table(factor(
+    result$smoking,
+    levels = c("never", "former", "current")
+  )))
+  expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.03)
+})
+
 test_that("generate_mock_data_native is reproducible without leaking RNG state", {
   spec <- mock_continuous("age", range = c(18, 85), rtype = "integer")
 
@@ -60,6 +93,22 @@ test_that("generate_mock_data_native handles empty specs and n = 0", {
   zero <- generate_mock_data_native(spec, n = 0, seed = 1)
   expect_equal(nrow(zero), 0)
   expect_named(zero, "smoking")
+
+  continuous_zero <- generate_mock_data_native(
+    mock_continuous("age", range = c(18, 85)),
+    n = 0,
+    seed = 1
+  )
+  expect_equal(nrow(continuous_zero), 0)
+  expect_named(continuous_zero, "age")
+
+  one <- generate_mock_data_native(
+    mock_continuous("age", range = c(18, 85), rtype = "integer"),
+    n = 1,
+    seed = 1
+  )
+  expect_equal(nrow(one), 1)
+  expect_true(one$age >= 18 && one$age <= 85)
 })
 
 test_that("generate_mock_data_native consumes simple recodeflow specs", {
@@ -86,6 +135,23 @@ test_that("generate_mock_data_native consumes simple recodeflow specs", {
   expect_true(all(result$age >= 18 & result$age <= 85))
   expect_true(all(result$smoking %in% c("never", "current")))
   expect_s3_class(result$interview_date, "Date")
+
+  direct_spec <- mock_spec(
+    mock_spec_continuous("age", range = c(18, 85), rtype = "integer"),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "current"),
+      proportions = c(0.75, 0.25),
+      rtype = "character"
+    ),
+    mock_spec_date(
+      "interview_date",
+      range = as.Date(c("2001-01-01", "2001-01-31"))
+    )
+  )
+
+  direct_result <- generate_mock_data_native(direct_spec, n = 100, seed = 55)
+  expect_equal(result, direct_result)
 })
 
 test_that("generate_mock_data_native fails loudly for unsupported native features", {
@@ -109,6 +175,33 @@ test_that("generate_mock_data_native fails loudly for unsupported native feature
   )
 })
 
+test_that("generate_mock_data_native rejects formula specs until evaluator milestone", {
+  variable <- mock_spec_continuous("bmi", range = c(15, 50))
+  variable$formula <- "weight / height^2"
+  spec <- mock_spec(variable)
+
+  expect_error(
+    generate_mock_data_native(spec, n = 10, seed = 1),
+    "Formula evaluation is not yet implemented"
+  )
+})
+
+test_that("generate_mock_data_native warns on truncated normal fallback", {
+  spec <- mock_spec(mock_spec_continuous(
+    "age",
+    range = c(0, 1),
+    distribution = "normal",
+    mean = 1000,
+    sd = 1
+  ))
+
+  expect_warning(
+    result <- generate_mock_data_native(spec, n = 5, seed = 1),
+    "Variable 'age'"
+  )
+  expect_true(all(result$age >= 0 & result$age <= 1))
+})
+
 test_that("generate_mock_data_native rejects lossy categorical coercion", {
   spec <- mock_categorical(
     "smoking",

From fb6ae3f1aa865650623f55945c35ab2b95b96a91 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 19:26:23 -0400
Subject: [PATCH 09/18] Add mock_spec post-processing layer

---
 NAMESPACE                                   |   1 +
 NEWS.md                                     |   3 +
 R/mock_spec_postprocess.R                   | 312 ++++++++++++++++++++
 tests/testthat/test-mock-spec-postprocess.R | 162 ++++++++++
 4 files changed, 478 insertions(+)
 create mode 100644 R/mock_spec_postprocess.R
 create mode 100644 tests/testthat/test-mock-spec-postprocess.R

diff --git a/NAMESPACE b/NAMESPACE
index 76b5988..52cae5a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -37,6 +37,7 @@ export(mock_spec_date)
 export(mock_spec_from_recodeflow)
 export(parse_range_notation)
 export(parse_variable_start)
+export(postprocess_mock_data)
 export(read_mock_data_config)
 export(read_mock_data_config_details)
 export(sample_with_proportions)
diff --git a/NEWS.md b/NEWS.md
index 5e14789..fe6fda1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -16,6 +16,9 @@
   survival/date fields.
 - Added `generate_mock_data_native()` to generate baseline valid mock data from
   `mock_spec` objects with the native R backend.
+- Added `postprocess_mock_data()` to apply `mock_spec` missing-code and
+  garbage-value rules after baseline generation, with diagnostics that
+  distinguish assigned missing/garbage rows from naturally drawn values.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R
new file mode 100644
index 0000000..f71a6d5
--- /dev/null
+++ b/R/mock_spec_postprocess.R
@@ -0,0 +1,312 @@
+# ==============================================================================
+# MockData v0.4 Post-processing Layer
+# ==============================================================================
+# Applies missing-code and garbage-value rules after baseline generation while
+# preserving diagnostics that distinguish assigned states from coincidental
+# value collisions.
+# ==============================================================================
+
+.postprocess_empty_diagnostics <- function(spec, n) {
+  variables <- lapply(spec$variables, function(variable) {
+    list(
+      n = n,
+      preexisting_missing_code_indices = integer(0),
+      assigned_missing_indices = integer(0),
+      assigned_missing_codes = character(0),
+      assigned_garbage_indices = list(low = integer(0), high = integer(0)),
+      assigned_garbage_values = list(low = character(0), high = character(0))
+    )
+  })
+
+  list(
+    spec_version = spec$spec_version,
+    variables = variables
+  )
+}
+
+.values_match_codes <- function(values, codes) {
+  if (length(codes) == 0) {
+    return(rep(FALSE, length(values)))
+  }
+
+  as.character(values) %in% as.character(codes)
+}
+
+.sample_postprocess_indices <- function(candidates, n, avoid = integer(0)) {
+  if (n == 0) {
+    return(integer(0))
+  }
+  if (length(candidates) < n) {
+    stop("Not enough candidate rows are available for post-processing.", call. = FALSE)
+  }
+
+  preferred <- setdiff(candidates, avoid)
+  if (length(preferred) >= n) {
+    return(.sample_values(preferred, n))
+  }
+
+  c(
+    preferred,
+    .sample_values(setdiff(candidates, preferred), n - length(preferred))
+  )
+}
+
+.coerce_postprocess_values <- function(values, variable, target) {
+  if (inherits(target, "factor")) {
+    return(as.character(values))
+  }
+
+  if (inherits(target, "Date") || variable$rtype == "date") {
+    converted <- as.Date(values)
+    if (any(is.na(converted) & !is.na(values))) {
+      stop("Variable '", variable$name, "' has date post-processing values that cannot be parsed.", call. = FALSE)
+    }
+    return(converted)
+  }
+
+  if (is.integer(target) || variable$rtype == "integer") {
+    converted <- suppressWarnings(as.integer(round(as.numeric(values))))
+    if (any(is.na(converted) & !is.na(values))) {
+      stop("Variable '", variable$name, "' has integer post-processing values that cannot be parsed.", call. = FALSE)
+    }
+    return(converted)
+  }
+
+  if (is.numeric(target) || variable$rtype %in% c("double", "numeric")) {
+    converted <- suppressWarnings(as.numeric(values))
+    if (any(is.na(converted) & !is.na(values))) {
+      stop("Variable '", variable$name, "' has numeric post-processing values that cannot be parsed.", call. = FALSE)
+    }
+    return(converted)
+  }
+
+  if (is.logical(target) || variable$rtype == "logical") {
+    value_chr <- as.character(values)
+    if (!all(value_chr %in% c("TRUE", "FALSE", "true", "false", "1", "0"))) {
+      stop("Variable '", variable$name, "' has logical post-processing values that cannot be parsed.", call. = FALSE)
+    }
+    return(value_chr %in% c("TRUE", "true", "1"))
+  }
+
+  as.character(values)
+}
+
+.assign_postprocess_values <- function(target, indices, values) {
+  if (length(indices) == 0) {
+    return(target)
+  }
+
+  if (inherits(target, "factor")) {
+    missing_levels <- setdiff(as.character(values), levels(target))
+    if (length(missing_levels) > 0) {
+      levels(target) <- c(levels(target), missing_levels)
+    }
+  }
+
+  target[indices] <- values
+  target
+}
+
+.generate_garbage_for_rule <- function(rule, variable, n) {
+  if (n == 0) {
+    return(vector(mode = "character", length = 0))
+  }
+
+  parsed <- parse_range_notation(rule$range)
+  if (is.null(parsed)) {
+    stop(
+      "Variable '", variable$name, "' has an invalid garbage range: ",
+      rule$range,
+      call. = FALSE
+    )
+  }
+
+  if (identical(parsed$type, "date")) {
+    date_values <- seq(parsed$min, parsed$max, by = "day")
+    return(.sample_values(date_values, n, replace = TRUE))
+  }
+
+  if (identical(parsed$type, "integer") && !is.null(parsed$values)) {
+    return(.sample_values(parsed$values, n, replace = TRUE))
+  }
+
+  values <- stats::runif(n, parsed$min, parsed$max)
+  if (variable$type == "categorical" || variable$rtype == "integer") {
+    values <- round(values)
+  }
+
+  values
+}
+
+.postprocess_missing <- function(values, variable, diagnostics) {
+  if (length(variable$missing_codes) == 0) {
+    return(list(values = values, diagnostics = diagnostics))
+  }
+
+  available <- seq_along(values)
+  preexisting <- which(.values_match_codes(values, variable$missing_codes))
+  assigned <- integer(0)
+  assigned_codes <- character(0)
+
+  for (i in seq_along(variable$missing_codes)) {
+    proportion <- variable$missing_proportions[[i]]
+    n_assign <- round(length(values) * proportion)
+    if (n_assign == 0) {
+      next
+    }
+
+    code <- variable$missing_codes[[i]]
+    code_values <- rep(code, n_assign)
+    assign_idx <- .sample_postprocess_indices(
+      available,
+      n_assign,
+      avoid = union(preexisting, assigned)
+    )
+    coerced <- .coerce_postprocess_values(code_values, variable, values)
+    values <- .assign_postprocess_values(values, assign_idx, coerced)
+
+    available <- setdiff(available, assign_idx)
+    assigned <- c(assigned, assign_idx)
+    assigned_codes <- c(assigned_codes, as.character(code_values))
+  }
+
+  diagnostics$preexisting_missing_code_indices <- preexisting
+  diagnostics$assigned_missing_indices <- assigned
+  diagnostics$assigned_missing_codes <- assigned_codes
+
+  list(values = values, diagnostics = diagnostics)
+}
+
+.postprocess_garbage <- function(values, variable, diagnostics) {
+  if (length(variable$garbage_rules) == 0) {
+    return(list(values = values, diagnostics = diagnostics))
+  }
+  if (is.null(names(variable$garbage_rules)) || any(names(variable$garbage_rules) == "")) {
+    stop("Variable '", variable$name, "' garbage_rules must be a named list.", call. = FALSE)
+  }
+
+  assigned_missing <- diagnostics$assigned_missing_indices
+  valid_idx <- setdiff(which(!is.na(values)), assigned_missing)
+  remaining_idx <- valid_idx
+
+  requested <- vapply(variable$garbage_rules, function(rule) {
+    proportion <- rule$proportion %||% 0
+    if (is.na(proportion)) {
+      proportion <- 0
+    }
+    as.integer(round(length(valid_idx) * proportion))
+  }, integer(1))
+
+  if (sum(requested) > length(valid_idx)) {
+    stop(
+      "Variable '", variable$name,
+      "' garbage rules request more rows than are available after missing-code assignment.",
+      call. = FALSE
+    )
+  }
+
+  for (rule_name in names(variable$garbage_rules)) {
+    rule <- variable$garbage_rules[[rule_name]]
+    n_assign <- requested[[rule_name]]
+    if (n_assign == 0) {
+      next
+    }
+
+    if (is.null(rule$range) || is.na(rule$range) || trimws(rule$range) == "") {
+      stop("Variable '", variable$name, "' garbage rule '", rule_name, "' is missing a range.", call. = FALSE)
+    }
+
+    assign_idx <- .sample_postprocess_indices(remaining_idx, n_assign)
+    raw_values <- .generate_garbage_for_rule(rule, variable, n_assign)
+    coerced <- .coerce_postprocess_values(raw_values, variable, values)
+    values <- .assign_postprocess_values(values, assign_idx, coerced)
+
+    diagnostics$assigned_garbage_indices[[rule_name]] <- assign_idx
+    diagnostics$assigned_garbage_values[[rule_name]] <- coerced
+    remaining_idx <- setdiff(remaining_idx, assign_idx)
+  }
+
+  list(values = values, diagnostics = diagnostics)
+}
+
+.postprocess_variable <- function(values, variable, diagnostics) {
+  missing_result <- .postprocess_missing(values, variable, diagnostics)
+  garbage_result <- .postprocess_garbage(
+    missing_result$values,
+    variable,
+    missing_result$diagnostics
+  )
+
+  garbage_result
+}
+
+#' Apply mock_spec post-processing rules
+#'
+#' `postprocess_mock_data()` applies v0.4 `mock_spec` missing-code and
+#' garbage-value rules to an already generated baseline data frame. It records a
+#' `mockdata_diagnostics` attribute so downstream checks can distinguish values
+#' assigned by post-processing from values that were drawn naturally by the
+#' baseline generator.
+#'
+#' @param data Data frame with one column for each variable in `spec`.
+#' @param spec A validated `mock_spec` object.
+#' @param seed Optional whole-number random seed. The previous R random state is
+#'   restored after post-processing.
+#' @param diagnostics Logical. If `TRUE`, attach a `mockdata_diagnostics`
+#'   attribute to the returned data frame.
+#'
+#' @return A data frame with post-processing applied.
+#' @family mock generation APIs
+#' @seealso [generate_mock_data_native()], [mock_spec()]
+#'
+#' @examples
+#' spec <- mock_categorical(
+#'   "smoking",
+#'   levels = c("never", "former", "current"),
+#'   proportions = c(0.5, 0.3, 0.2),
+#'   rtype = "character",
+#'   missing_codes = "9",
+#'   missing_proportions = 0.05
+#' )
+#' baseline <- generate_mock_data_native(spec, n = 20, seed = 1)
+#' result <- postprocess_mock_data(baseline, spec, seed = 2)
+#' attr(result, "mockdata_diagnostics")$variables$smoking
+#'
+#' @export
+postprocess_mock_data <- function(data, spec, seed = NULL, diagnostics = TRUE) {
+  if (!is.data.frame(data)) {
+    stop("data must be a data frame.", call. = FALSE)
+  }
+  validate_mock_spec(spec, n = nrow(data), strict = TRUE)
+
+  missing_columns <- setdiff(names(spec$variables), names(data))
+  if (length(missing_columns) > 0) {
+    stop(
+      "data is missing column(s) required by spec: ",
+      paste(missing_columns, collapse = ", "),
+      call. = FALSE
+    )
+  }
+
+  .with_mock_seed(seed, {
+    output <- data
+    diag <- .postprocess_empty_diagnostics(spec, nrow(data))
+
+    for (variable_name in names(spec$variables)) {
+      variable <- spec$variables[[variable_name]]
+      result <- .postprocess_variable(
+        output[[variable_name]],
+        variable,
+        diag$variables[[variable_name]]
+      )
+      output[[variable_name]] <- result$values
+      diag$variables[[variable_name]] <- result$diagnostics
+    }
+
+    if (isTRUE(diagnostics)) {
+      attr(output, "mockdata_diagnostics") <- diag
+    }
+
+    output
+  })
+}
diff --git a/tests/testthat/test-mock-spec-postprocess.R b/tests/testthat/test-mock-spec-postprocess.R
new file mode 100644
index 0000000..b4dc87f
--- /dev/null
+++ b/tests/testthat/test-mock-spec-postprocess.R
@@ -0,0 +1,162 @@
+test_that("postprocess_mock_data distinguishes missing-code collisions", {
+  spec <- mock_categorical(
+    "response",
+    levels = c("1", "97"),
+    proportions = c(0.7, 0.3),
+    rtype = "character",
+    missing_codes = "97",
+    missing_proportions = 0.2
+  )
+  baseline <- generate_mock_data_native(spec, n = 200, seed = 11)
+
+  result <- postprocess_mock_data(baseline, spec, seed = 12)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$response
+
+  expect_true(length(diagnostics$preexisting_missing_code_indices) > 0)
+  expect_equal(length(diagnostics$assigned_missing_indices), 40)
+  expect_length(intersect(
+    diagnostics$preexisting_missing_code_indices,
+    diagnostics$assigned_missing_indices
+  ), 0)
+  expect_true(all(result$response[diagnostics$assigned_missing_indices] == "97"))
+  expect_true(any(baseline$response[diagnostics$preexisting_missing_code_indices] == "97"))
+})
+
+test_that("postprocess_mock_data applies integer missing and garbage rules", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    rtype = "integer",
+    missing_codes = 997,
+    missing_proportions = 0.1,
+    garbage_rules = list(high = list(proportion = 0.05, range = "[150, 200]"))
+  )
+  baseline <- generate_mock_data_native(spec, n = 100, seed = 21)
+
+  result <- postprocess_mock_data(baseline, spec, seed = 22)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$age
+  high_idx <- diagnostics$assigned_garbage_indices$high
+
+  expect_type(result$age, "integer")
+  expect_equal(length(diagnostics$assigned_missing_indices), 10)
+  expect_equal(result$age[diagnostics$assigned_missing_indices], rep(997L, 10))
+  expect_equal(length(high_idx), round(90 * 0.05))
+  expect_true(all(result$age[high_idx] >= 150L & result$age[high_idx] <= 200L))
+  expect_length(intersect(high_idx, diagnostics$assigned_missing_indices), 0)
+})
+
+test_that("postprocess_mock_data preserves Date values", {
+  spec <- mock_date(
+    "interview_date",
+    range = as.Date(c("2001-01-01", "2001-01-31")),
+    missing_codes = "2099-01-01",
+    missing_proportions = 0.1,
+    garbage_rules = list(high = list(
+      proportion = 0.1,
+      range = "[2025-01-01, 2025-01-31]"
+    ))
+  )
+  baseline <- generate_mock_data_native(spec, n = 50, seed = 31)
+
+  result <- postprocess_mock_data(baseline, spec, seed = 32)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$interview_date
+  high_idx <- diagnostics$assigned_garbage_indices$high
+
+  expect_s3_class(result$interview_date, "Date")
+  expect_equal(
+    result$interview_date[diagnostics$assigned_missing_indices],
+    rep(as.Date("2099-01-01"), 5)
+  )
+  expect_true(all(result$interview_date[high_idx] >= as.Date("2025-01-01")))
+  expect_true(all(result$interview_date[high_idx] <= as.Date("2025-01-31")))
+})
+
+test_that("postprocess_mock_data is reproducible without leaking RNG state", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    rtype = "integer",
+    missing_codes = 997,
+    missing_proportions = 0.1,
+    garbage_rules = list(high = list(proportion = 0.1, range = "[150, 200]"))
+  )
+  baseline <- generate_mock_data_native(spec, n = 100, seed = 71)
+
+  set.seed(999)
+  before <- runif(1)
+  result_1 <- postprocess_mock_data(baseline, spec, seed = 72)
+  after_1 <- runif(1)
+
+  set.seed(999)
+  expect_equal(runif(1), before)
+  result_2 <- postprocess_mock_data(baseline, spec, seed = 72)
+  after_2 <- runif(1)
+
+  expect_equal(result_1, result_2)
+  expect_equal(after_1, after_2)
+})
+
+test_that("postprocess_mock_data preserves and extends factor levels", {
+  spec <- mock_categorical(
+    "smoking",
+    levels = c("1", "2", "3"),
+    proportions = c(0.5, 0.3, 0.2),
+    missing_codes = "9",
+    missing_proportions = 0.1,
+    garbage_rules = list(low = list(proportion = 0.1, range = "[-2, 0]"))
+  )
+  baseline <- generate_mock_data_native(spec, n = 60, seed = 41)
+
+  result <- postprocess_mock_data(baseline, spec, seed = 42)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$smoking
+  low_idx <- diagnostics$assigned_garbage_indices$low
+
+  expect_s3_class(result$smoking, "factor")
+  expect_true("9" %in% levels(result$smoking))
+  expect_true(all(as.character(result$smoking[diagnostics$assigned_missing_indices]) == "9"))
+  expect_true(all(as.character(result$smoking[low_idx]) %in% c("-2", "-1", "0")))
+})
+
+test_that("postprocess_mock_data rejects unnamed garbage rules", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    garbage_rules = list(list(proportion = 0.1, range = "[150, 200]"))
+  )
+  baseline <- generate_mock_data_native(spec, n = 10, seed = 81)
+
+  expect_error(
+    postprocess_mock_data(baseline, spec, seed = 82),
+    "named list"
+  )
+})
+
+test_that("postprocess_mock_data rejects impossible garbage requests", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    garbage_rules = list(
+      low = list(proportion = 0.8, range = "[-10, 0]"),
+      high = list(proportion = 0.8, range = "[150, 200]")
+    )
+  )
+  baseline <- generate_mock_data_native(spec, n = 10, seed = 51)
+
+  expect_error(
+    postprocess_mock_data(baseline, spec, seed = 52),
+    "request more rows"
+  )
+})
+
+test_that("postprocess_mock_data validates input shape and diagnostics opt-out", {
+  spec <- mock_continuous("age", range = c(18, 85))
+  baseline <- generate_mock_data_native(spec, n = 10, seed = 61)
+
+  expect_error(
+    postprocess_mock_data(data.frame(other = 1:10), spec),
+    "missing column"
+  )
+
+  result <- postprocess_mock_data(baseline, spec, diagnostics = FALSE)
+  expect_null(attr(result, "mockdata_diagnostics"))
+})

From 6daef47cc411115d2b2d898591a48a2e9dde2cd7 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 19:50:53 -0400
Subject: [PATCH 10/18] Harden mock_spec post-processing diagnostics

---
 R/mock_spec_postprocess.R                   | 64 +++++++++++++++--
 tests/testthat/test-mock-spec-postprocess.R | 76 ++++++++++++++++++++-
 2 files changed, 133 insertions(+), 7 deletions(-)

diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R
index f71a6d5..fc1b671 100644
--- a/R/mock_spec_postprocess.R
+++ b/R/mock_spec_postprocess.R
@@ -8,13 +8,27 @@
 
 .postprocess_empty_diagnostics <- function(spec, n) {
   variables <- lapply(spec$variables, function(variable) {
+    garbage_rule_names <- names(variable$garbage_rules)
+    if (is.null(garbage_rule_names)) {
+      garbage_rule_names <- character(0)
+    }
+    garbage_rule_names <- .ordered_garbage_rule_names(garbage_rule_names)
+    garbage_indices <- stats::setNames(
+      rep(list(integer(0)), length(garbage_rule_names)),
+      garbage_rule_names
+    )
+    garbage_values <- stats::setNames(
+      rep(list(character(0)), length(garbage_rule_names)),
+      garbage_rule_names
+    )
+
     list(
       n = n,
       preexisting_missing_code_indices = integer(0),
       assigned_missing_indices = integer(0),
       assigned_missing_codes = character(0),
-      assigned_garbage_indices = list(low = integer(0), high = integer(0)),
-      assigned_garbage_values = list(low = character(0), high = character(0))
+      assigned_garbage_indices = garbage_indices,
+      assigned_garbage_values = garbage_values
     )
   })
 
@@ -138,10 +152,21 @@
   values
 }
 
+.ordered_garbage_rule_names <- function(rule_names) {
+  c(intersect(c("low", "high"), rule_names), setdiff(rule_names, c("low", "high")))
+}
+
 .postprocess_missing <- function(values, variable, diagnostics) {
   if (length(variable$missing_codes) == 0) {
     return(list(values = values, diagnostics = diagnostics))
   }
+  if (sum(variable$missing_proportions) > 1 + .mock_spec_probability_tolerance) {
+    stop(
+      "Variable '", variable$name,
+      "' missing proportions request more rows than are available.",
+      call. = FALSE
+    )
+  }
 
   available <- seq_along(values)
   preexisting <- which(.values_match_codes(values, variable$missing_codes))
@@ -182,11 +207,25 @@
     return(list(values = values, diagnostics = diagnostics))
   }
   if (is.null(names(variable$garbage_rules)) || any(names(variable$garbage_rules) == "")) {
-    stop("Variable '", variable$name, "' garbage_rules must be a named list.", call. = FALSE)
+    rule_names <- names(variable$garbage_rules)
+    unnamed <- if (is.null(rule_names)) {
+      seq_along(variable$garbage_rules)
+    } else {
+      which(rule_names == "")
+    }
+    stop(
+      "Variable '", variable$name, "' garbage_rules must be a named list; ",
+      "unnamed rule index: ", paste(unnamed, collapse = ", "),
+      ".",
+      call. = FALSE
+    )
   }
 
-  assigned_missing <- diagnostics$assigned_missing_indices
-  valid_idx <- setdiff(which(!is.na(values)), assigned_missing)
+  protected_idx <- union(
+    diagnostics$assigned_missing_indices,
+    diagnostics$preexisting_missing_code_indices
+  )
+  valid_idx <- setdiff(which(!is.na(values)), protected_idx)
   remaining_idx <- valid_idx
 
   requested <- vapply(variable$garbage_rules, function(rule) {
@@ -205,7 +244,7 @@
     )
   }
 
-  for (rule_name in names(variable$garbage_rules)) {
+  for (rule_name in .ordered_garbage_rule_names(names(variable$garbage_rules))) {
     rule <- variable$garbage_rules[[rule_name]]
     n_assign <- requested[[rule_name]]
     if (n_assign == 0) {
@@ -256,6 +295,12 @@
 #'   attribute to the returned data frame.
 #'
 #' @return A data frame with post-processing applied.
+#'
+#' @details
+#' Diagnostics are stored as a data-frame attribute. Base R subsetting and some
+#' downstream tools may drop attributes, so preserve the original post-processed
+#' object when diagnostics are part of the audit trail.
+#'
 #' @family mock generation APIs
 #' @seealso [generate_mock_data_native()], [mock_spec()]
 #'
@@ -277,6 +322,13 @@ postprocess_mock_data <- function(data, spec, seed = NULL, diagnostics = TRUE) {
   if (!is.data.frame(data)) {
     stop("data must be a data frame.", call. = FALSE)
   }
+  if (!is.null(attr(data, "mockdata_diagnostics"))) {
+    stop(
+      "postprocess_mock_data() appears to have already run on this data. ",
+      "Start from baseline generated data to avoid double post-processing.",
+      call. = FALSE
+    )
+  }
   validate_mock_spec(spec, n = nrow(data), strict = TRUE)
 
   missing_columns <- setdiff(names(spec$variables), names(data))
diff --git a/tests/testthat/test-mock-spec-postprocess.R b/tests/testthat/test-mock-spec-postprocess.R
index b4dc87f..cd942b4 100644
--- a/tests/testthat/test-mock-spec-postprocess.R
+++ b/tests/testthat/test-mock-spec-postprocess.R
@@ -22,6 +22,47 @@ test_that("postprocess_mock_data distinguishes missing-code collisions", {
   expect_true(any(baseline$response[diagnostics$preexisting_missing_code_indices] == "97"))
 })
 
+test_that("postprocess_mock_data does not overwrite preexisting missing-code collisions with garbage", {
+  spec <- mock_categorical(
+    "response",
+    levels = c("1", "97"),
+    proportions = c(0.5, 0.5),
+    rtype = "character",
+    missing_codes = "97",
+    missing_proportions = 0,
+    garbage_rules = list(low = list(proportion = 1, range = "[-2, 0]"))
+  )
+  baseline <- generate_mock_data_native(spec, n = 100, seed = 13)
+  expect_true(any(baseline$response == "97"))
+
+  result <- postprocess_mock_data(baseline, spec, seed = 14)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$response
+
+  preexisting <- diagnostics$preexisting_missing_code_indices
+  garbage <- diagnostics$assigned_garbage_indices$low
+
+  expect_length(intersect(preexisting, garbage), 0)
+  expect_true(all(result$response[preexisting] == "97"))
+})
+
+test_that("postprocess_mock_data rejects missing-proportion overflow", {
+  spec <- mock_categorical(
+    "response",
+    levels = c("1", "2"),
+    proportions = c(0.5, 0.5),
+    rtype = "character",
+    missing_codes = c("97", "98"),
+    missing_proportions = c(0.1, 0.1)
+  )
+  spec$variables$response$missing_proportions <- c(0.6, 0.6)
+  baseline <- data.frame(response = rep(c("1", "2"), each = 10))
+
+  expect_error(
+    postprocess_mock_data(baseline, spec, seed = 16),
+    "missing proportions must sum"
+  )
+})
+
 test_that("postprocess_mock_data applies integer missing and garbage rules", {
   spec <- mock_continuous(
     "age",
@@ -117,6 +158,23 @@ test_that("postprocess_mock_data preserves and extends factor levels", {
   expect_true(all(as.character(result$smoking[low_idx]) %in% c("-2", "-1", "0")))
 })
 
+test_that("postprocess_mock_data applies garbage rules in canonical order", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    garbage_rules = list(
+      high = list(proportion = 0.1, range = "[150, 200]"),
+      low = list(proportion = 0.1, range = "[-10, 0]")
+    )
+  )
+  baseline <- generate_mock_data_native(spec, n = 20, seed = 81)
+
+  result <- postprocess_mock_data(baseline, spec, seed = 82)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$age
+
+  expect_equal(names(diagnostics$assigned_garbage_indices), c("low", "high"))
+})
+
 test_that("postprocess_mock_data rejects unnamed garbage rules", {
   spec <- mock_continuous(
     "age",
@@ -127,7 +185,7 @@ test_that("postprocess_mock_data rejects unnamed garbage rules", {
 
   expect_error(
     postprocess_mock_data(baseline, spec, seed = 82),
-    "named list"
+    "unnamed rule index: 1"
   )
 })
 
@@ -160,3 +218,19 @@ test_that("postprocess_mock_data validates input shape and diagnostics opt-out",
   result <- postprocess_mock_data(baseline, spec, diagnostics = FALSE)
   expect_null(attr(result, "mockdata_diagnostics"))
 })
+
+test_that("postprocess_mock_data rejects idempotent re-call", {
+  spec <- mock_continuous(
+    "age",
+    range = c(18, 85),
+    missing_codes = 997,
+    missing_proportions = 0.1
+  )
+  baseline <- generate_mock_data_native(spec, n = 20, seed = 91)
+  result <- postprocess_mock_data(baseline, spec, seed = 92)
+
+  expect_error(
+    postprocess_mock_data(result, spec, seed = 93),
+    "already run"
+  )
+})

From 2b6aa1e2aedf7e41d883c10a54b990532863d47e Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Mon, 18 May 2026 19:55:11 -0400
Subject: [PATCH 11/18] Promote mock_spec pipeline assertions

---
 tests/testthat/test-mock-spec-pipeline.R | 145 +++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 tests/testthat/test-mock-spec-pipeline.R

diff --git a/tests/testthat/test-mock-spec-pipeline.R b/tests/testthat/test-mock-spec-pipeline.R
new file mode 100644
index 0000000..a53cf26
--- /dev/null
+++ b/tests/testthat/test-mock-spec-pipeline.R
@@ -0,0 +1,145 @@
+run_native_pipeline <- function(spec, n, seed) {
+  baseline <- generate_mock_data_native(spec, n = n, seed = seed)
+  postprocess_mock_data(baseline, spec, seed = seed + 1)
+}
+
+test_that("native mock_spec pipeline preserves categorical codes and diagnostics", {
+  spec <- mock_categorical(
+    "response",
+    levels = c("1", "97"),
+    proportions = c(0.55, 0.45),
+    rtype = "character",
+    missing_codes = "97",
+    missing_proportions = 0.2,
+    garbage_rules = list(low = list(proportion = 0.4, range = "[-2, 0]"))
+  )
+
+  result <- run_native_pipeline(spec, n = 200, seed = 101)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$response
+
+  expect_true(all(result$response %in% c("1", "97", "-2", "-1", "0")))
+  expect_equal(length(diagnostics$assigned_missing_indices), 40)
+  expect_true(length(diagnostics$preexisting_missing_code_indices) > 0)
+  expect_length(intersect(
+    diagnostics$preexisting_missing_code_indices,
+    diagnostics$assigned_missing_indices
+  ), 0)
+  expect_length(intersect(
+    diagnostics$preexisting_missing_code_indices,
+    diagnostics$assigned_garbage_indices$low
+  ), 0)
+  expect_true(all(result$response[diagnostics$preexisting_missing_code_indices] == "97"))
+  expect_true(all(result$response[diagnostics$assigned_missing_indices] == "97"))
+})
+
+test_that("native mock_spec pipeline is reproducible as a composed workflow", {
+  spec <- mock_spec(
+    mock_spec_continuous(
+      "age",
+      range = c(18, 85),
+      distribution = "normal",
+      mean = 50,
+      sd = 12,
+      rtype = "integer",
+      missing_codes = 997,
+      missing_proportions = 0.05,
+      garbage_rules = list(high = list(proportion = 0.05, range = "[150, 200]"))
+    ),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    )
+  )
+
+  first <- run_native_pipeline(spec, n = 100, seed = 202)
+  second <- run_native_pipeline(spec, n = 100, seed = 202)
+
+  expect_equal(first, second)
+  expect_equal(
+    attr(first, "mockdata_diagnostics"),
+    attr(second, "mockdata_diagnostics")
+  )
+})
+
+test_that("recodeflow pipeline preserves recEnd-driven missingness and garbage", {
+  variables <- data.frame(
+    variable = "smoking",
+    variableType = "Categorical",
+    rType = "character",
+    role = "enabled",
+    garbage_low_prop = 0.1,
+    garbage_low_range = "[-2, 0]",
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = "smoking",
+    recStart = c("1", "2", "97", "99"),
+    recEnd = c("copy", "copy", "NA::b", "NA::b"),
+    proportion = c(0.5, 0.3, 0.1, 0.1),
+    stringsAsFactors = FALSE
+  )
+  spec <- mock_spec_from_recodeflow(variables, variable_details)
+
+  result <- run_native_pipeline(spec, n = 100, seed = 303)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$smoking
+
+  expect_equal(spec$variables$smoking$levels, c("1", "2"))
+  expect_equal(spec$variables$smoking$missing_codes, c("97", "99"))
+  expect_equal(length(diagnostics$assigned_missing_indices), 20)
+  expect_equal(length(diagnostics$assigned_garbage_indices$low), 8)
+  expect_true(all(result$smoking[diagnostics$assigned_missing_indices] %in% c("97", "99")))
+  expect_true(all(result$smoking[diagnostics$assigned_garbage_indices$low] %in% c("-2", "-1", "0")))
+  expect_length(intersect(
+    diagnostics$assigned_missing_indices,
+    diagnostics$assigned_garbage_indices$low
+  ), 0)
+})
+
+test_that("direct and recodeflow pipelines agree for equivalent specs", {
+  direct_spec <- mock_categorical(
+    "smoking",
+    levels = c("1", "2"),
+    proportions = c(0.625, 0.375),
+    rtype = "character",
+    missing_codes = c("97", "99"),
+    missing_proportions = c(0.1, 0.1),
+    garbage_rules = list(low = list(proportion = 0.1, range = "[-2, 0]"))
+  )
+  variables <- data.frame(
+    variable = "smoking",
+    variableType = "Categorical",
+    rType = "character",
+    role = "enabled",
+    garbage_low_prop = 0.1,
+    garbage_low_range = "[-2, 0]",
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = "smoking",
+    recStart = c("1", "2", "97", "99"),
+    recEnd = c("copy", "copy", "NA::b", "NA::b"),
+    proportion = c(0.5, 0.3, 0.1, 0.1),
+    stringsAsFactors = FALSE
+  )
+  recodeflow_spec <- mock_spec_from_recodeflow(variables, variable_details)
+
+  direct_result <- run_native_pipeline(direct_spec, n = 100, seed = 404)
+  recodeflow_result <- run_native_pipeline(recodeflow_spec, n = 100, seed = 404)
+
+  attr(direct_result, "mockdata_diagnostics") <- NULL
+  attr(recodeflow_result, "mockdata_diagnostics") <- NULL
+  expect_equal(direct_result, recodeflow_result)
+})
+
+test_that("pipeline keeps deferred formula variables loud", {
+  variable <- mock_spec_continuous("bmi", range = c(15, 50))
+  variable$formula <- "weight / height^2"
+  spec <- mock_spec(variable)
+
+  expect_error(
+    run_native_pipeline(spec, n = 10, seed = 505),
+    "Formula evaluation is not yet implemented"
+  )
+})

From c28354a2ef1bfa273d8488cb0c60e0ea3e1ec81e Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Tue, 19 May 2026 19:37:32 -0400
Subject: [PATCH 12/18] Add optional simstudy mock_spec backend

---
 DESCRIPTION                            |   1 +
 NAMESPACE                              |   1 +
 NEWS.md                                |   3 +
 R/mock_spec_simstudy.R                 | 169 +++++++++++++++++++++++++
 tests/testthat/test-simstudy-backend.R | 108 ++++++++++++++++
 5 files changed, 282 insertions(+)
 create mode 100644 R/mock_spec_simstudy.R
 create mode 100644 tests/testthat/test-simstudy-backend.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 6a79c59..3f5a388 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -32,6 +32,7 @@ Suggests:
     readr,
     stringr,
     lubridate,
+    simstudy,
     knitr,
     quarto,
     devtools,
diff --git a/NAMESPACE b/NAMESPACE
index 52cae5a..987afd8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,6 +16,7 @@ export(extract_distribution_params)
 export(extract_proportions)
 export(generate_garbage_values)
 export(generate_mock_data_native)
+export(generate_mock_data_simstudy)
 export(get_cycle_variables)
 export(get_enabled_variables)
 export(get_raw_var_dependencies)
diff --git a/NEWS.md b/NEWS.md
index fe6fda1..f5aa558 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -19,6 +19,9 @@
 - Added `postprocess_mock_data()` to apply `mock_spec` missing-code and
   garbage-value rules after baseline generation, with diagnostics that
   distinguish assigned missing/garbage rows from naturally drawn values.
+- Added `generate_mock_data_simstudy()` as a soft-gated optional backend for
+  baseline categorical and uniform continuous generation when `simstudy` is
+  installed, with native generation retained for MockData-specific semantics.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec_simstudy.R b/R/mock_spec_simstudy.R
new file mode 100644
index 0000000..776dce4
--- /dev/null
+++ b/R/mock_spec_simstudy.R
@@ -0,0 +1,169 @@
+# ==============================================================================
+# MockData v0.4 Optional simstudy Backend
+# ==============================================================================
+# Baseline generation from mock_spec using simstudy when the optional package is
+# installed. MockData still owns post-processing and diagnostics.
+# ==============================================================================
+
+.require_simstudy <- function() {
+  if (!requireNamespace("simstudy", quietly = TRUE)) {
+    stop(
+      "The optional simstudy backend requires the 'simstudy' package. ",
+      "Install simstudy or use generate_mock_data_native().",
+      call. = FALSE
+    )
+  }
+
+  invisible(TRUE)
+}
+
+.simstudy_definition <- function(def, variable) {
+  if (variable$type == "continuous") {
+    distribution <- tolower(variable$distribution %||% "uniform")
+    if (distribution == "uniform") {
+      return(simstudy::defData(
+        dtDefs = def,
+        varname = variable$name,
+        formula = paste(variable$range, collapse = ";"),
+        dist = "uniform"
+      ))
+    }
+  }
+
+  if (variable$type == "categorical") {
+    probabilities <- variable$proportions
+    if (is.null(probabilities)) {
+      probabilities <- rep(1 / length(variable$levels), length(variable$levels))
+    }
+
+    return(simstudy::defData(
+      dtDefs = def,
+      varname = variable$name,
+      formula = paste(probabilities, collapse = ";"),
+      variance = paste(variable$levels, collapse = ";"),
+      dist = "categorical"
+    ))
+  }
+
+  stop(
+    "simstudy backend does not yet support variable '", variable$name,
+    "' of type '", variable$type, "'.",
+    call. = FALSE
+  )
+}
+
+.simstudy_can_generate <- function(variable) {
+  if (variable$type == "categorical") {
+    return(TRUE)
+  }
+
+  variable$type == "continuous" &&
+    identical(tolower(variable$distribution %||% "uniform"), "uniform")
+}
+
+.simstudy_variables <- function(spec) {
+  Filter(.simstudy_can_generate, spec$variables)
+}
+
+.native_only_variables <- function(spec) {
+  Filter(function(variable) !.simstudy_can_generate(variable), spec$variables)
+}
+
+.generate_simstudy_baseline <- function(variables, n) {
+  if (length(variables) == 0) {
+    return(.empty_native_data(n))
+  }
+
+  def <- NULL
+  for (variable in variables) {
+    def <- .simstudy_definition(def, variable)
+  }
+
+  generated <- as.data.frame(simstudy::genData(n, def), stringsAsFactors = FALSE)
+  generated <- generated[, names(variables), drop = FALSE]
+
+  for (variable_name in names(variables)) {
+    variable <- variables[[variable_name]]
+    if (variable$type == "continuous") {
+      generated[[variable_name]] <- .coerce_native_continuous(
+        generated[[variable_name]],
+        variable$rtype,
+        variable$name
+      )
+    } else if (variable$type == "categorical") {
+      generated[[variable_name]] <- .coerce_native_categorical(
+        as.character(generated[[variable_name]]),
+        as.character(variable$levels),
+        variable$rtype,
+        variable$name
+      )
+    }
+  }
+
+  generated
+}
+
+.generate_native_only_baseline <- function(variables, n) {
+  if (length(variables) == 0) {
+    return(.empty_native_data(n))
+  }
+
+  columns <- lapply(variables, .generate_native_variable, n = n)
+  names(columns) <- names(variables)
+  as.data.frame(columns, stringsAsFactors = FALSE, check.names = FALSE)
+}
+
+#' Generate mock data with the optional simstudy backend
+#'
+#' `generate_mock_data_simstudy()` consumes a validated `mock_spec` and
+#' generates baseline valid values through the optional `simstudy` package for
+#' supported uniform continuous and categorical variables. MockData remains
+#' responsible for missing-code injection, garbage values, and diagnostics
+#' through [postprocess_mock_data()].
+#'
+#' Variables that need MockData semantics not covered by this milestone, such as
+#' truncated normal ranges and calendar dates, are generated by MockData's native
+#' path inside the same seeded call.
+#'
+#' @param spec A `mock_spec` object.
+#' @param n Non-negative whole number of rows to generate.
+#' @param seed Optional whole-number random seed. The previous R random state is
+#'   restored after generation.
+#'
+#' @return A data frame with one column per `mock_spec` variable and `n` rows.
+#' @family mock generation APIs
+#' @seealso [generate_mock_data_native()], [postprocess_mock_data()],
+#'   [mock_spec()]
+#'
+#' @examples
+#' spec <- mock_continuous("age", range = c(18, 85), rtype = "integer")
+#' if (requireNamespace("simstudy", quietly = TRUE)) {
+#'   data <- generate_mock_data_simstudy(spec, n = 10, seed = 1)
+#'   head(data)
+#' }
+#'
+#' @export
+generate_mock_data_simstudy <- function(spec, n, seed = NULL) {
+  .require_simstudy()
+  validate_mock_spec(spec, n = n, strict = TRUE)
+  .check_native_backend_scope(spec)
+
+  simstudy_variables <- .simstudy_variables(spec)
+  native_only_variables <- .native_only_variables(spec)
+
+  .with_mock_seed(seed, {
+    simstudy_data <- .generate_simstudy_baseline(simstudy_variables, n)
+    native_data <- .generate_native_only_baseline(native_only_variables, n)
+
+    columns <- c(simstudy_data, native_data)
+    if (length(spec$variables) == 0) {
+      return(.empty_native_data(n))
+    }
+
+    as.data.frame(
+      columns[names(spec$variables)],
+      stringsAsFactors = FALSE,
+      check.names = FALSE
+    )
+  })
+}
diff --git a/tests/testthat/test-simstudy-backend.R b/tests/testthat/test-simstudy-backend.R
new file mode 100644
index 0000000..e479d44
--- /dev/null
+++ b/tests/testthat/test-simstudy-backend.R
@@ -0,0 +1,108 @@
+test_that("generate_mock_data_simstudy fails clearly when simstudy is unavailable", {
+  if (requireNamespace("simstudy", quietly = TRUE)) {
+    skip("simstudy is installed; unavailable-path test is not applicable")
+  }
+
+  spec <- mock_continuous("age", range = c(18, 85))
+
+  expect_error(
+    generate_mock_data_simstudy(spec, n = 10, seed = 1),
+    "requires the 'simstudy' package"
+  )
+})
+
+test_that("generate_mock_data_simstudy generates supported baseline specs", {
+  skip_if_not_installed("simstudy")
+
+  spec <- mock_spec(
+    mock_spec_continuous("age", range = c(18, 85), rtype = "integer"),
+    mock_spec_continuous(
+      "bmi",
+      range = c(15, 50),
+      distribution = "normal",
+      mean = 27,
+      sd = 5
+    ),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    ),
+    mock_spec_date("interview_date", range = as.Date(c("2001-01-01", "2001-01-31")))
+  )
+
+  result <- generate_mock_data_simstudy(spec, n = 1000, seed = 707)
+
+  expect_named(result, c("age", "bmi", "smoking", "interview_date"))
+  expect_type(result$age, "integer")
+  expect_type(result$bmi, "double")
+  expect_type(result$smoking, "character")
+  expect_s3_class(result$interview_date, "Date")
+  expect_true(all(result$age >= 18 & result$age <= 85))
+  expect_true(all(result$bmi >= 15 & result$bmi <= 50))
+  expect_equal(mean(result$bmi), 27, tolerance = 1)
+
+  observed <- prop.table(table(factor(
+    result$smoking,
+    levels = c("never", "former", "current")
+  )))
+  expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.05)
+})
+
+test_that("generate_mock_data_simstudy composes with MockData post-processing", {
+  skip_if_not_installed("simstudy")
+
+  spec <- mock_categorical(
+    "response",
+    levels = c("1", "97"),
+    proportions = c(0.6, 0.4),
+    rtype = "character",
+    missing_codes = "97",
+    missing_proportions = 0.2,
+    garbage_rules = list(low = list(proportion = 0.2, range = "[-2, 0]"))
+  )
+
+  baseline <- generate_mock_data_simstudy(spec, n = 100, seed = 808)
+  result <- postprocess_mock_data(baseline, spec, seed = 809)
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$response
+
+  expect_equal(length(diagnostics$assigned_missing_indices), 20)
+  expect_true(length(diagnostics$preexisting_missing_code_indices) > 0)
+  expect_length(intersect(
+    diagnostics$preexisting_missing_code_indices,
+    diagnostics$assigned_garbage_indices$low
+  ), 0)
+})
+
+test_that("generate_mock_data_simstudy is reproducible", {
+  skip_if_not_installed("simstudy")
+
+  spec <- mock_spec(
+    mock_spec_continuous("age", range = c(18, 85), rtype = "integer"),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    )
+  )
+
+  first <- generate_mock_data_simstudy(spec, n = 100, seed = 909)
+  second <- generate_mock_data_simstudy(spec, n = 100, seed = 909)
+
+  expect_equal(first, second)
+})
+
+test_that("generate_mock_data_simstudy keeps deferred formula variables loud", {
+  skip_if_not_installed("simstudy")
+
+  variable <- mock_spec_continuous("bmi", range = c(15, 50))
+  variable$formula <- "weight / height^2"
+  spec <- mock_spec(variable)
+
+  expect_error(
+    generate_mock_data_simstudy(spec, n = 10, seed = 1),
+    "Formula evaluation is not yet implemented"
+  )
+})

From 70da7b85815dc165b575856c073eb16d4f103997 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Tue, 19 May 2026 19:57:20 -0400
Subject: [PATCH 13/18] Harden optional simstudy backend

---
 DESCRIPTION                            |   2 +-
 R/mock_spec_simstudy.R                 |  51 +++++++++-
 tests/testthat/test-simstudy-backend.R | 127 ++++++++++++++++++++++++-
 3 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3f5a388..dbb0f20 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -32,7 +32,7 @@ Suggests:
     readr,
     stringr,
     lubridate,
-    simstudy,
+    simstudy (>= 0.8.1),
     knitr,
     quarto,
     devtools,
diff --git a/R/mock_spec_simstudy.R b/R/mock_spec_simstudy.R
index 776dce4..2535623 100644
--- a/R/mock_spec_simstudy.R
+++ b/R/mock_spec_simstudy.R
@@ -13,11 +13,40 @@
       call. = FALSE
     )
   }
+  if (utils::packageVersion("simstudy") < "0.8.1") {
+    stop(
+      "The optional simstudy backend requires simstudy >= 0.8.1. ",
+      "Install a newer simstudy version or use generate_mock_data_native().",
+      call. = FALSE
+    )
+  }
+
+  invisible(TRUE)
+}
+
+.check_simstudy_variable <- function(variable) {
+  if (identical(variable$name, "id")) {
+    stop(
+      "Variable name 'id' conflicts with simstudy's generated row identifier. ",
+      "Rename the variable or use generate_mock_data_native().",
+      call. = FALSE
+    )
+  }
+
+  if (variable$type == "categorical" && any(grepl(";", as.character(variable$levels), fixed = TRUE))) {
+    stop(
+      "Variable '", variable$name,
+      "' has categorical level(s) containing ';', which simstudy uses as a delimiter.",
+      call. = FALSE
+    )
+  }
 
   invisible(TRUE)
 }
 
 .simstudy_definition <- function(def, variable) {
+  .check_simstudy_variable(variable)
+
   if (variable$type == "continuous") {
     distribution <- tolower(variable$distribution %||% "uniform")
     if (distribution == "uniform") {
@@ -52,6 +81,26 @@
   )
 }
 
+.normalize_simstudy_categorical <- function(values, variable) {
+  value_chr <- as.character(values)
+  levels <- as.character(variable$levels)
+  if (all(value_chr %in% levels)) {
+    return(value_chr)
+  }
+
+  index <- suppressWarnings(as.integer(value_chr))
+  if (!any(is.na(index)) && all(index >= 1 & index <= length(levels))) {
+    return(levels[index])
+  }
+
+  stop(
+    "simstudy returned categorical values for variable '", variable$name,
+    "' that do not match the mock_spec levels. ",
+    "This may indicate a simstudy version or delimiter mismatch.",
+    call. = FALSE
+  )
+}
+
 .simstudy_can_generate <- function(variable) {
   if (variable$type == "categorical") {
     return(TRUE)
@@ -92,7 +141,7 @@
       )
     } else if (variable$type == "categorical") {
       generated[[variable_name]] <- .coerce_native_categorical(
-        as.character(generated[[variable_name]]),
+        .normalize_simstudy_categorical(generated[[variable_name]], variable),
         as.character(variable$levels),
         variable$rtype,
         variable$name
diff --git a/tests/testthat/test-simstudy-backend.R b/tests/testthat/test-simstudy-backend.R
index e479d44..93924b5 100644
--- a/tests/testthat/test-simstudy-backend.R
+++ b/tests/testthat/test-simstudy-backend.R
@@ -50,6 +50,114 @@ test_that("generate_mock_data_simstudy generates supported baseline specs", {
   expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.05)
 })
 
+test_that("generate_mock_data_simstudy protects simstudy-specific categorical contracts", {
+  skip_if_not_installed("simstudy")
+
+  semicolon <- mock_categorical(
+    "group",
+    levels = c("never;former", "current"),
+    proportions = c(0.5, 0.5),
+    rtype = "character"
+  )
+  expect_error(
+    generate_mock_data_simstudy(semicolon, n = 10, seed = 1),
+    "containing ';'"
+  )
+
+  reserved <- mock_categorical(
+    "id",
+    levels = c("a", "b"),
+    proportions = c(0.5, 0.5),
+    rtype = "character"
+  )
+  expect_error(
+    generate_mock_data_simstudy(reserved, n = 10, seed = 1),
+    "conflicts with simstudy"
+  )
+})
+
+test_that("simstudy categorical normalization handles labels and old integer indices", {
+  variable <- mock_spec_categorical(
+    "smoking",
+    levels = c("never", "former", "current"),
+    proportions = c(0.5, 0.3, 0.2),
+    rtype = "character"
+  )
+
+  expect_equal(
+    MockData:::.normalize_simstudy_categorical(c("never", "current"), variable),
+    c("never", "current")
+  )
+  expect_equal(
+    MockData:::.normalize_simstudy_categorical(c(1L, 3L), variable),
+    c("never", "current")
+  )
+  expect_error(
+    MockData:::.normalize_simstudy_categorical(c("mystery"), variable),
+    "do not match"
+  )
+})
+
+test_that("generate_mock_data_simstudy roughly matches native contracts when installed", {
+  skip_if_not_installed("simstudy")
+
+  spec <- mock_spec(
+    mock_spec_continuous("age", range = c(18, 85), rtype = "integer"),
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    )
+  )
+
+  native <- generate_mock_data_native(spec, n = 5000, seed = 1001)
+  simstudy <- generate_mock_data_simstudy(spec, n = 5000, seed = 1001)
+
+  expect_named(simstudy, names(native))
+  expect_type(simstudy$age, "integer")
+  expect_type(simstudy$smoking, "character")
+  expect_equal(mean(simstudy$age), mean(native$age), tolerance = 2)
+  expect_equal(stats::sd(simstudy$age), stats::sd(native$age), tolerance = 2)
+
+  observed <- prop.table(table(factor(
+    simstudy$smoking,
+    levels = c("never", "former", "current")
+  )))
+  expect_equal(as.numeric(observed), c(0.5, 0.3, 0.2), tolerance = 0.05)
+})
+
+test_that("generate_mock_data_simstudy routes unsupported pieces through native backend", {
+  skip_if_not_installed("simstudy")
+
+  spec <- mock_spec(
+    mock_spec_categorical(
+      "smoking",
+      levels = c("never", "former", "current"),
+      proportions = c(0.5, 0.3, 0.2),
+      rtype = "character"
+    ),
+    mock_spec_continuous(
+      "bmi",
+      range = c(15, 50),
+      distribution = "normal",
+      mean = 27,
+      sd = 5
+    ),
+    mock_spec_date("interview_date", range = as.Date(c("2001-01-01", "2001-01-31")))
+  )
+
+  expect_false(MockData:::.simstudy_can_generate(spec$variables$bmi))
+  expect_false(MockData:::.simstudy_can_generate(spec$variables$interview_date))
+
+  result <- generate_mock_data_simstudy(spec, n = 1000, seed = 1002)
+  expect_true(all(result$bmi >= 15 & result$bmi <= 50))
+  expect_equal(mean(result$bmi), 27, tolerance = 1)
+  expect_s3_class(result$interview_date, "Date")
+  expect_true(all(result$interview_date >= as.Date("2001-01-01")))
+  expect_true(all(result$interview_date <= as.Date("2001-01-31")))
+})
+
 test_that("generate_mock_data_simstudy composes with MockData post-processing", {
   skip_if_not_installed("simstudy")
 
@@ -91,7 +199,24 @@ test_that("generate_mock_data_simstudy is reproducible", {
   first <- generate_mock_data_simstudy(spec, n = 100, seed = 909)
   second <- generate_mock_data_simstudy(spec, n = 100, seed = 909)
 
-  expect_equal(first, second)
+  expect_identical(first, second)
+})
+
+test_that("generate_mock_data_simstudy handles empty specs and n = 0", {
+  skip_if_not_installed("simstudy")
+
+  empty <- generate_mock_data_simstudy(mock_spec(), n = 5, seed = 1)
+  expect_s3_class(empty, "data.frame")
+  expect_equal(nrow(empty), 5)
+  expect_equal(ncol(empty), 0)
+
+  zero <- generate_mock_data_simstudy(
+    mock_categorical("smoking", levels = c("never", "former", "current")),
+    n = 0,
+    seed = 1
+  )
+  expect_equal(nrow(zero), 0)
+  expect_named(zero, "smoking")
 })
 
 test_that("generate_mock_data_simstudy keeps deferred formula variables loud", {

From e109d546865fa925da454ebb2fac24b529b5b405 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Tue, 19 May 2026 20:06:09 -0400
Subject: [PATCH 14/18] Polish v0.4 reference documentation

---
 NEWS.md                   |  6 ++++++
 R/mock_spec.R             | 39 ++++++++++++++++++++++++++++++++++++++-
 R/mock_spec_native.R      | 33 ++++++++++++++++++++++++++++++++-
 R/mock_spec_postprocess.R | 35 +++++++++++++++++++++++++++++++----
 R/mock_spec_recodeflow.R  | 38 +++++++++++++++++++++++++++++++++++++-
 R/mock_spec_simstudy.R    | 30 ++++++++++++++++++++++++++++++
 6 files changed, 174 insertions(+), 7 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index f5aa558..9bdd9b5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -19,9 +19,15 @@
 - Added `postprocess_mock_data()` to apply `mock_spec` missing-code and
   garbage-value rules after baseline generation, with diagnostics that
   distinguish assigned missing/garbage rows from naturally drawn values.
+- Post-processing diagnostics now protect naturally drawn missing-code
+  collisions from later garbage assignment, apply garbage rules in canonical
+  `low` -> `high` -> other order, and stop on repeated post-processing.
 - Added `generate_mock_data_simstudy()` as a soft-gated optional backend for
   baseline categorical and uniform continuous generation when `simstudy` is
   installed, with native generation retained for MockData-specific semantics.
+- The optional `simstudy` backend is kept in `Suggests`, requires
+  `simstudy >= 0.8.1`, and validates categorical labels before converting
+  generated values back into MockData's `mock_spec` levels.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec.R b/R/mock_spec.R
index 7fd3e51..5efd59e 100644
--- a/R/mock_spec.R
+++ b/R/mock_spec.R
@@ -46,10 +46,12 @@ NULL
   if (is.null(x)) y else x
 }
 
+#' @noRd
 .is_non_empty_string <- function(x) {
   is.character(x) && length(x) == 1 && !is.na(x) && nzchar(trimws(x))
 }
 
+#' @noRd
 .normalize_provenance <- function(provenance, source = NULL) {
   if (is.null(provenance)) {
     provenance <- list(adapter = "direct", source = source %||% "direct")
@@ -75,6 +77,7 @@ NULL
   )
 }
 
+#' @noRd
 .validate_model_hint <- function(model_hint) {
   if (length(model_hint) != 1 || is.na(model_hint) || !model_hint %in% .mock_spec_model_hints) {
     stop(
@@ -87,6 +90,7 @@ NULL
   invisible(TRUE)
 }
 
+#' @noRd
 .new_mock_spec_variable <- function(name,
                                     type,
                                     rtype,
@@ -137,6 +141,7 @@ NULL
   )
 }
 
+#' @noRd
 .as_mock_spec_variable_list <- function(...) {
   variables <- list(...)
 
@@ -160,6 +165,7 @@ NULL
   variables
 }
 
+#' @noRd
 .direct_api_provenance <- function(source, provenance = NULL) {
   provenance <- provenance %||% list(source = source)
 
@@ -178,6 +184,14 @@ NULL
 #' new architecture. Direct APIs and recodeflow adapters should both normalize
 #' into this shape before validation and generation.
 #'
+#' @details
+#' The v0.4 API is layered. The `mock_*()` helpers are the simple direct API for
+#' one-variable specifications. The `mock_spec_*()` constructors create variable
+#' specifications that can be composed with `mock_spec()`. Metadata adapters,
+#' such as [mock_spec_from_recodeflow()], translate external metadata into the
+#' same internal shape. Generation backends consume `mock_spec` objects rather
+#' than re-reading user-facing metadata.
+#'
 #' @param ... `mock_spec_variable` objects, or a single list of them. `NULL`
 #'   creates an empty specification.
 #' @param spec_version Character version of the specification shape.
@@ -189,7 +203,9 @@ NULL
 #'
 #' @return S3 object of class `mock_spec`.
 #' @family mock specification APIs
-#' @seealso [mock_continuous()], [mock_categorical()], [mock_date()]
+#' @seealso [mock_continuous()], [mock_categorical()], [mock_date()],
+#'   [mock_spec_from_recodeflow()], [generate_mock_data_native()],
+#'   [postprocess_mock_data()]
 #'
 #' @examples
 #' spec <- mock_spec(
@@ -233,6 +249,11 @@ mock_spec <- function(...,
 #' returns a validated `mock_spec`; it does not generate data. Generation
 #' backends will consume this specification in a later v0.4 milestone.
 #'
+#' @details
+#' Use `mock_continuous()` when specifying one variable directly in R code. Use
+#' [mock_spec_continuous()] with [mock_spec()] when composing several variables
+#' or when writing an adapter from another metadata source.
+#'
 #' @param name Variable name.
 #' @param range Numeric vector of length two giving the inclusive valid range.
 #' @param distribution Distribution name. Defaults to `"uniform"`.
@@ -302,6 +323,11 @@ mock_continuous <- function(name,
 #' `mock_categorical()` is the simple direct API for categorical variables. It
 #' returns a validated `mock_spec`; it does not generate data.
 #'
+#' @details
+#' Use `mock_categorical()` when specifying one variable directly in R code. Use
+#' [mock_spec_categorical()] with [mock_spec()] when composing several variables
+#' or when writing an adapter from another metadata source.
+#'
 #' @param name Variable name.
 #' @param levels Character vector of valid levels or codes.
 #' @param proportions Optional probabilities aligned to `levels`.
@@ -363,6 +389,11 @@ mock_categorical <- function(name,
 #' `mock_date()` is the simple direct API for date variables. It returns a
 #' validated `mock_spec`; it does not generate data.
 #'
+#' @details
+#' Date variables default to `model_hint = "native-postprocess"` because MockData
+#' owns calendar-date generation and source-format conversion. Optional
+#' backends may still generate other variables in the same specification.
+#'
 #' @param name Variable name.
 #' @param range Date vector of length two giving the inclusive valid date range.
 #' @param rtype R output type. Defaults to `"date"`.
@@ -587,6 +618,7 @@ is_mock_spec <- function(x) {
   inherits(x, "mock_spec")
 }
 
+#' @noRd
 .new_mock_spec_validation_result <- function(valid = TRUE,
                                              errors = character(0),
                                              warnings = character(0),
@@ -631,6 +663,7 @@ print.mock_spec_validation_result <- function(x, ...) {
   invisible(x)
 }
 
+#' @noRd
 .validate_probability_vector <- function(values, label, allow_null = FALSE) {
   errors <- character(0)
 
@@ -655,6 +688,7 @@ print.mock_spec_validation_result <- function(x, ...) {
   errors
 }
 
+#' @noRd
 .validate_provenance <- function(provenance, label) {
   errors <- character(0)
 
@@ -671,6 +705,7 @@ print.mock_spec_validation_result <- function(x, ...) {
   errors
 }
 
+#' @noRd
 .validate_missing_spec <- function(variable) {
   errors <- character(0)
 
@@ -702,6 +737,7 @@ print.mock_spec_validation_result <- function(x, ...) {
   errors
 }
 
+#' @noRd
 .validate_range <- function(range, variable_name, expected_class = "numeric") {
   errors <- character(0)
 
@@ -726,6 +762,7 @@ print.mock_spec_validation_result <- function(x, ...) {
   errors
 }
 
+#' @noRd
 .validate_mock_spec_variable <- function(variable) {
   errors <- character(0)
 
diff --git a/R/mock_spec_native.R b/R/mock_spec_native.R
index 990cbf2..53c8286 100644
--- a/R/mock_spec_native.R
+++ b/R/mock_spec_native.R
@@ -5,6 +5,7 @@
 # garbage, diagnostics, and richer rType handling lands in later milestones.
 # ==============================================================================
 
+#' @noRd
 .with_mock_seed <- function(seed, expr) {
   if (is.null(seed)) {
     return(force(expr))
@@ -19,6 +20,7 @@
     old_seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
   }
 
+  # Generation should be reproducible without changing the caller's RNG stream.
   on.exit({
     if (had_seed) {
       assign(".Random.seed", old_seed, envir = .GlobalEnv)
@@ -31,10 +33,12 @@
   force(expr)
 }
 
+#' @noRd
 .empty_native_data <- function(n) {
   data.frame(row.names = seq_len(n))
 }
 
+#' @noRd
 .sample_indices <- function(n_levels, n, prob = NULL) {
   if (n == 0) {
     return(integer(0))
@@ -42,6 +46,7 @@
   sample.int(n_levels, size = n, replace = TRUE, prob = prob)
 }
 
+#' @noRd
 .native_formula_variables <- function(spec) {
   names(Filter(function(variable) {
     formula <- variable$formula
@@ -50,6 +55,7 @@
   }, spec$variables))
 }
 
+#' @noRd
 .check_native_backend_scope <- function(spec) {
   formula_variables <- .native_formula_variables(spec)
   if (length(formula_variables) > 0) {
@@ -65,6 +71,7 @@
   invisible(TRUE)
 }
 
+#' @noRd
 .native_truncated_normal <- function(n, mean, sd, range, variable_name) {
   if (n == 0) {
     return(numeric(0))
@@ -84,6 +91,8 @@
   }
 
   if (length(remaining) > 0) {
+    # Pathological truncation windows can make rejection sampling impractical;
+    # keep generation bounded and tell the caller the tail came from uniform.
     warning(
       "Variable '", variable_name,
       "': could not fill all truncated-normal values by rejection sampling; ",
@@ -96,6 +105,7 @@
   values
 }
 
+#' @noRd
 .coerce_native_continuous <- function(values, rtype, variable_name) {
   if (rtype == "integer") {
     return(as.integer(round(values)))
@@ -111,6 +121,7 @@
   )
 }
 
+#' @noRd
 .coerce_native_categorical <- function(values, levels, rtype, variable_name) {
   if (rtype == "factor") {
     return(factor(values, levels = levels))
@@ -158,6 +169,7 @@
   )
 }
 
+#' @noRd
 .coerce_native_date <- function(values, rtype, variable_name) {
   if (rtype == "date") {
     return(values)
@@ -173,6 +185,7 @@
   )
 }
 
+#' @noRd
 .generate_native_continuous <- function(variable, n) {
   distribution <- tolower(variable$distribution %||% "uniform")
 
@@ -197,6 +210,7 @@
   .coerce_native_continuous(values, variable$rtype, variable$name)
 }
 
+#' @noRd
 .generate_native_categorical <- function(variable, n) {
   levels <- as.character(variable$levels)
   prob <- variable$proportions
@@ -208,6 +222,7 @@
   .coerce_native_categorical(values, levels, variable$rtype, variable$name)
 }
 
+#' @noRd
 .generate_native_date <- function(variable, n) {
   distribution <- tolower(variable$distribution %||% "uniform")
   if (distribution != "uniform") {
@@ -222,6 +237,8 @@
     values <- as.Date(character(0))
   } else {
     range_numeric <- as.integer(variable$range)
+    # Sample day offsets numerically, then restore Date class; this avoids
+    # locale-dependent date parsing during generation.
     offsets <- .sample_indices(
       range_numeric[[2]] - range_numeric[[1]] + 1,
       n
@@ -232,6 +249,7 @@
   .coerce_native_date(values, variable$rtype, variable$name)
 }
 
+#' @noRd
 .generate_native_variable <- function(variable, n) {
   if (variable$type == "continuous") {
     return(.generate_native_continuous(variable, n))
@@ -257,6 +275,18 @@
 #' not yet apply missing-code injection, garbage values, diagnostics, formula
 #' evaluation, or optional `simstudy` features.
 #'
+#' @details
+#' The native backend is the default MIT-licensed baseline engine. It currently
+#' supports uniform continuous variables, truncated-normal continuous variables,
+#' categorical variables, and uniform calendar dates. Missing codes, garbage
+#' values, and diagnostics are intentionally handled by [postprocess_mock_data()]
+#' so that all backends share the same audit trail.
+#'
+#' If `seed` is supplied, the previous R random state is restored after
+#' generation. This gives reproducible output without advancing the caller's RNG
+#' stream. Formula variables are rejected loudly until the formula/dependency
+#' milestone promotes the spike evaluator into production.
+#'
 #' @param spec A `mock_spec` object.
 #' @param n Non-negative whole number of rows to generate.
 #' @param seed Optional whole-number random seed. The previous R random state is
@@ -264,7 +294,8 @@
 #'
 #' @return A data frame with one column per `mock_spec` variable and `n` rows.
 #' @family mock generation APIs
-#' @seealso [mock_spec()], [mock_continuous()], [mock_spec_from_recodeflow()]
+#' @seealso [mock_spec()], [mock_continuous()], [mock_spec_from_recodeflow()],
+#'   [postprocess_mock_data()], [generate_mock_data_simstudy()]
 #'
 #' @examples
 #' spec <- mock_spec(
diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R
index fc1b671..d2751d4 100644
--- a/R/mock_spec_postprocess.R
+++ b/R/mock_spec_postprocess.R
@@ -6,6 +6,7 @@
 # value collisions.
 # ==============================================================================
 
+#' @noRd
 .postprocess_empty_diagnostics <- function(spec, n) {
   variables <- lapply(spec$variables, function(variable) {
     garbage_rule_names <- names(variable$garbage_rules)
@@ -38,6 +39,7 @@
   )
 }
 
+#' @noRd
 .values_match_codes <- function(values, codes) {
   if (length(codes) == 0) {
     return(rep(FALSE, length(values)))
@@ -46,6 +48,7 @@
   as.character(values) %in% as.character(codes)
 }
 
+#' @noRd
 .sample_postprocess_indices <- function(candidates, n, avoid = integer(0)) {
   if (n == 0) {
     return(integer(0))
@@ -65,6 +68,7 @@
   )
 }
 
+#' @noRd
 .coerce_postprocess_values <- function(values, variable, target) {
   if (inherits(target, "factor")) {
     return(as.character(values))
@@ -105,6 +109,7 @@
   as.character(values)
 }
 
+#' @noRd
 .assign_postprocess_values <- function(target, indices, values) {
   if (length(indices) == 0) {
     return(target)
@@ -121,6 +126,7 @@
   target
 }
 
+#' @noRd
 .generate_garbage_for_rule <- function(rule, variable, n) {
   if (n == 0) {
     return(vector(mode = "character", length = 0))
@@ -152,10 +158,14 @@
   values
 }
 
+#' @noRd
 .ordered_garbage_rule_names <- function(rule_names) {
+  # Keep the long-standing garbage convention deterministic: low rules run
+  # before high rules; any future rule names follow in caller order.
   c(intersect(c("low", "high"), rule_names), setdiff(rule_names, c("low", "high")))
 }
 
+#' @noRd
 .postprocess_missing <- function(values, variable, diagnostics) {
   if (length(variable$missing_codes) == 0) {
     return(list(values = values, diagnostics = diagnostics))
@@ -169,6 +179,8 @@
   }
 
   available <- seq_along(values)
+  # Record values that naturally collide with declared missing codes before
+  # assigning any new missing codes; this is the auditability contract.
   preexisting <- which(.values_match_codes(values, variable$missing_codes))
   assigned <- integer(0)
   assigned_codes <- character(0)
@@ -202,6 +214,7 @@
   list(values = values, diagnostics = diagnostics)
 }
 
+#' @noRd
 .postprocess_garbage <- function(values, variable, diagnostics) {
   if (length(variable$garbage_rules) == 0) {
     return(list(values = values, diagnostics = diagnostics))
@@ -225,6 +238,9 @@
     diagnostics$assigned_missing_indices,
     diagnostics$preexisting_missing_code_indices
   )
+  # Garbage must not overwrite either assigned missing rows or naturally drawn
+  # missing-code collisions, otherwise diagnostics would no longer describe the
+  # returned data.
   valid_idx <- setdiff(which(!is.na(values)), protected_idx)
   remaining_idx <- valid_idx
 
@@ -268,6 +284,7 @@
   list(values = values, diagnostics = diagnostics)
 }
 
+#' @noRd
 .postprocess_variable <- function(values, variable, diagnostics) {
   missing_result <- .postprocess_missing(values, variable, diagnostics)
   garbage_result <- .postprocess_garbage(
@@ -297,12 +314,22 @@
 #' @return A data frame with post-processing applied.
 #'
 #' @details
-#' Diagnostics are stored as a data-frame attribute. Base R subsetting and some
-#' downstream tools may drop attributes, so preserve the original post-processed
-#' object when diagnostics are part of the audit trail.
+#' Missing-code diagnostics separate values that were naturally drawn as a
+#' declared missing code (`preexisting_missing_code_indices`) from values that
+#' were assigned by post-processing (`assigned_missing_indices`). Garbage rules
+#' are applied only to rows that are not missing-code diagnostics, preserving the
+#' audit trail for collision cases such as a valid category code that is also a
+#' declared missing code.
+#'
+#' Garbage rules are applied in canonical order: `low`, then `high`, then any
+#' other named rules in caller order. Diagnostics are stored as a data-frame
+#' attribute. Base R subsetting and some downstream tools may drop attributes,
+#' so preserve the original post-processed object when diagnostics are part of
+#' the audit trail.
 #'
 #' @family mock generation APIs
-#' @seealso [generate_mock_data_native()], [mock_spec()]
+#' @seealso [generate_mock_data_native()], [generate_mock_data_simstudy()],
+#'   [mock_spec()]
 #'
 #' @examples
 #' spec <- mock_categorical(
diff --git a/R/mock_spec_recodeflow.R b/R/mock_spec_recodeflow.R
index f713a15..bcf0f52 100644
--- a/R/mock_spec_recodeflow.R
+++ b/R/mock_spec_recodeflow.R
@@ -5,6 +5,7 @@
 # normalized mock_spec representation.
 # ==============================================================================
 
+#' @noRd
 .read_recodeflow_table <- function(x, label) {
   if (is.data.frame(x)) {
     return(x)
@@ -14,6 +15,8 @@
     if (!file.exists(x)) {
       stop(label, " file does not exist: ", x, call. = FALSE)
     }
+    # Pin CSV parsing so path inputs behave like data-frame inputs for the
+    # recodeflow conventions MockData understands.
     return(read.csv(
       x,
       stringsAsFactors = FALSE,
@@ -25,10 +28,12 @@
   stop(label, " must be a data frame or a single CSV path.", call. = FALSE)
 }
 
+#' @noRd
 .is_blank <- function(x) {
   is.null(x) || length(x) == 0 || is.na(x[1]) || trimws(as.character(x[1])) == ""
 }
 
+#' @noRd
 .row_value <- function(row, name, default = NA) {
   if (!name %in% names(row)) {
     return(default)
@@ -42,6 +47,7 @@
   value
 }
 
+#' @noRd
 .row_character <- function(row, name, default = NA_character_) {
   value <- .row_value(row, name, default)
   if (.is_blank(value)) {
@@ -50,6 +56,7 @@
   as.character(value)
 }
 
+#' @noRd
 .row_numeric <- function(row, name, default = NA_real_) {
   value <- .row_value(row, name, default)
   if (.is_blank(value)) {
@@ -69,6 +76,7 @@
   numeric_value
 }
 
+#' @noRd
 .recodeflow_required_columns <- function(data, required, label) {
   missing <- setdiff(required, names(data))
   if (length(missing) > 0) {
@@ -76,6 +84,7 @@
   }
 }
 
+#' @noRd
 .filter_recodeflow_by_database <- function(data, databaseStart, allow_empty = TRUE) {
   if (is.null(databaseStart)) {
     return(data)
@@ -87,9 +96,12 @@
     )
   }
 
+  # databaseStart fields are comma-separated tokens, not substrings; this keeps
+  # cycles such as cchs2017 and cchs2017_2018_p distinct.
   data[.database_start_matches(data$databaseStart, databaseStart, allow_empty = allow_empty), , drop = FALSE]
 }
 
+#' @noRd
 .filter_recodeflow_details <- function(variable_details, variable, databaseStart) {
   if (is.null(variable_details)) {
     return(NULL)
@@ -99,6 +111,7 @@
   .filter_recodeflow_by_database(details, databaseStart, allow_empty = TRUE)
 }
 
+#' @noRd
 .recodeflow_variable_kind <- function(var_row) {
   rtype <- tolower(.row_character(var_row, "rType", ""))
   variable_type <- tolower(.row_character(var_row, "variableType", ""))
@@ -121,6 +134,7 @@
   )
 }
 
+#' @noRd
 .recodeflow_rtype <- function(var_row, kind) {
   rtype <- tolower(.row_character(var_row, "rType", ""))
   if (rtype != "") {
@@ -138,6 +152,7 @@
   )
 }
 
+#' @noRd
 .parse_single_date <- function(value) {
   if (.is_blank(value)) {
     return(NULL)
@@ -151,6 +166,7 @@
   NULL
 }
 
+#' @noRd
 .recodeflow_valid_rows <- function(details) {
   if (is.null(details) || nrow(details) == 0) {
     return(details)
@@ -159,6 +175,8 @@
   rec_start <- as.character(details$recStart)
   rec_end <- if ("recEnd" %in% names(details)) as.character(details$recEnd) else rep("", nrow(details))
 
+  # Functional and derived rows live in recStart; recEnd carries missing-code
+  # semantics such as NA::a / NA::b.
   keep <- !is.na(rec_start) &
     rec_start != "" &
     rec_start != "else" &
@@ -170,6 +188,7 @@
   details[keep, , drop = FALSE]
 }
 
+#' @noRd
 .recodeflow_range <- function(details, variable, kind) {
   valid_rows <- .recodeflow_valid_rows(details)
   if (is.null(valid_rows) || nrow(valid_rows) == 0) {
@@ -199,6 +218,7 @@
   stop("Variable '", variable, "' has no parseable ", kind, " range in recStart.", call. = FALSE)
 }
 
+#' @noRd
 .recodeflow_missing <- function(details) {
   if (is.null(details) || nrow(details) == 0 || !"recEnd" %in% names(details)) {
     return(list(codes = character(0), proportions = numeric(0)))
@@ -223,6 +243,7 @@
   )
 }
 
+#' @noRd
 .recodeflow_distribution <- function(var_row, details) {
   distribution <- tolower(.row_character(var_row, "distribution", ""))
   if (distribution != "") {
@@ -245,6 +266,7 @@
   params$distribution %||% "uniform"
 }
 
+#' @noRd
 .recodeflow_garbage_rules <- function(var_row) {
   rules <- list()
 
@@ -269,6 +291,7 @@
   rules
 }
 
+#' @noRd
 .recodeflow_provenance <- function(variable, databaseStart = NULL) {
   provenance <- list(adapter = "recodeflow", source = variable)
   if (!is.null(databaseStart)) {
@@ -277,6 +300,7 @@
   provenance
 }
 
+#' @noRd
 .recodeflow_to_spec_variable <- function(var_row, details, databaseStart) {
   variable <- .row_character(var_row, "variable")
   kind <- .recodeflow_variable_kind(var_row)
@@ -364,6 +388,12 @@
 #' generated. Set `exclude_derived = FALSE` only when you want those rows to
 #' appear in the adapter input and fail or be handled by later formula support.
 #'
+#' CSV path inputs are read with `stringsAsFactors = FALSE`,
+#' `check.names = FALSE`, and `na.strings = c("", "NA")` so path-based inputs
+#' preserve recodeflow column names and treat blank metadata cells like missing
+#' values. The adapter normalizes `rType = "numeric"` to `"double"` to match
+#' the v0.4 `mock_spec` type vocabulary.
+#'
 #' @param variables Data frame or CSV path for recodeflow-style `variables`
 #'   metadata.
 #' @param variable_details Data frame, CSV path, or `NULL` for recodeflow-style
@@ -380,7 +410,7 @@
 #' @return A validated `mock_spec` object.
 #' @family mock specification APIs
 #' @seealso [mock_spec()], [mock_continuous()], [mock_categorical()],
-#'   [mock_date()]
+#'   [mock_date()], [generate_mock_data_native()], [postprocess_mock_data()]
 #'
 #' @examples
 #' variables <- data.frame(
@@ -399,6 +429,12 @@
 #' spec <- mock_spec_from_recodeflow(variables, details)
 #' validate_mock_spec(spec)
 #'
+#' variables_file <- tempfile(fileext = ".csv")
+#' details_file <- tempfile(fileext = ".csv")
+#' write.csv(variables, variables_file, row.names = FALSE)
+#' write.csv(details, details_file, row.names = FALSE)
+#' spec_from_files <- mock_spec_from_recodeflow(variables_file, details_file)
+#'
 #' @export
 mock_spec_from_recodeflow <- function(variables,
                                       variable_details = NULL,
diff --git a/R/mock_spec_simstudy.R b/R/mock_spec_simstudy.R
index 2535623..a6b08fc 100644
--- a/R/mock_spec_simstudy.R
+++ b/R/mock_spec_simstudy.R
@@ -5,6 +5,7 @@
 # installed. MockData still owns post-processing and diagnostics.
 # ==============================================================================
 
+#' @noRd
 .require_simstudy <- function() {
   if (!requireNamespace("simstudy", quietly = TRUE)) {
     stop(
@@ -24,6 +25,7 @@
   invisible(TRUE)
 }
 
+#' @noRd
 .check_simstudy_variable <- function(variable) {
   if (identical(variable$name, "id")) {
     stop(
@@ -34,6 +36,8 @@
   }
 
   if (variable$type == "categorical" && any(grepl(";", as.character(variable$levels), fixed = TRUE))) {
+    # simstudy encodes categorical probabilities and labels as semicolon-
+    # delimited strings; labels containing ';' cannot round-trip safely.
     stop(
       "Variable '", variable$name,
       "' has categorical level(s) containing ';', which simstudy uses as a delimiter.",
@@ -44,6 +48,7 @@
   invisible(TRUE)
 }
 
+#' @noRd
 .simstudy_definition <- function(def, variable) {
   .check_simstudy_variable(variable)
 
@@ -65,6 +70,8 @@
       probabilities <- rep(1 / length(variable$levels), length(variable$levels))
     }
 
+    # defData() expects categorical probabilities in formula and labels in
+    # variance, both as semicolon-delimited vectors.
     return(simstudy::defData(
       dtDefs = def,
       varname = variable$name,
@@ -81,6 +88,7 @@
   )
 }
 
+#' @noRd
 .normalize_simstudy_categorical <- function(values, variable) {
   value_chr <- as.character(values)
   levels <- as.character(variable$levels)
@@ -88,6 +96,8 @@
     return(value_chr)
   }
 
+  # Older simstudy versions returned 1-based category indices in some paths;
+  # keep the adapter explicit so cross-version drift cannot masquerade as data.
   index <- suppressWarnings(as.integer(value_chr))
   if (!any(is.na(index)) && all(index >= 1 & index <= length(levels))) {
     return(levels[index])
@@ -101,6 +111,7 @@
   )
 }
 
+#' @noRd
 .simstudy_can_generate <- function(variable) {
   if (variable$type == "categorical") {
     return(TRUE)
@@ -110,14 +121,17 @@
     identical(tolower(variable$distribution %||% "uniform"), "uniform")
 }
 
+#' @noRd
 .simstudy_variables <- function(spec) {
   Filter(.simstudy_can_generate, spec$variables)
 }
 
+#' @noRd
 .native_only_variables <- function(spec) {
   Filter(function(variable) !.simstudy_can_generate(variable), spec$variables)
 }
 
+#' @noRd
 .generate_simstudy_baseline <- function(variables, n) {
   if (length(variables) == 0) {
     return(.empty_native_data(n))
@@ -152,6 +166,7 @@
   generated
 }
 
+#' @noRd
 .generate_native_only_baseline <- function(variables, n) {
   if (length(variables) == 0) {
     return(.empty_native_data(n))
@@ -174,6 +189,19 @@
 #' truncated normal ranges and calendar dates, are generated by MockData's native
 #' path inside the same seeded call.
 #'
+#' @details
+#' `simstudy` is an optional `Suggests` dependency and is GPL-3 licensed;
+#' MockData remains MIT licensed by keeping this backend soft-gated and by
+#' retaining [generate_mock_data_native()] as the default engine. Use this
+#' backend when `simstudy` is installed and you want to exercise the optional
+#' engine path. It currently delegates only categorical and uniform continuous
+#' baseline generation to `simstudy`; unsupported variables are routed through
+#' MockData's native backend so that a single specification can mix capabilities.
+#'
+#' Missing-code assignment, garbage values, and diagnostics are not delegated to
+#' `simstudy`. They remain MockData-owned post-processing so both backends share
+#' the same auditability contract.
+#'
 #' @param spec A `mock_spec` object.
 #' @param n Non-negative whole number of rows to generate.
 #' @param seed Optional whole-number random seed. The previous R random state is
@@ -202,6 +230,8 @@ generate_mock_data_simstudy <- function(spec, n, seed = NULL) {
 
   .with_mock_seed(seed, {
     simstudy_data <- .generate_simstudy_baseline(simstudy_variables, n)
+    # Normal/date variables stay native until MockData has an explicit contract
+    # for mapping their range and calendar semantics into simstudy definitions.
     native_data <- .generate_native_only_baseline(native_only_variables, n)
 
     columns <- c(simstudy_data, native_data)

From 3d448df827331d9e201860e0ee6cf54665b08b43 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Wed, 20 May 2026 06:26:10 -0400
Subject: [PATCH 15/18] Clarify v0.4 postprocess and simstudy notes

---
 NEWS.md                   |  7 ++++++-
 R/mock_spec_postprocess.R | 11 +++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 9bdd9b5..3fcdd78 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -21,13 +21,18 @@
   distinguish assigned missing/garbage rows from naturally drawn values.
 - Post-processing diagnostics now protect naturally drawn missing-code
   collisions from later garbage assignment, apply garbage rules in canonical
-  `low` -> `high` -> other order, and stop on repeated post-processing.
+  `low` -> `high` -> other order, and stop on repeated post-processing. This
+  prevents silent diagnostic drift when a naturally drawn missing-code value
+  would otherwise be overwritten by garbage assignment.
 - Added `generate_mock_data_simstudy()` as a soft-gated optional backend for
   baseline categorical and uniform continuous generation when `simstudy` is
   installed, with native generation retained for MockData-specific semantics.
 - The optional `simstudy` backend is kept in `Suggests`, requires
   `simstudy >= 0.8.1`, and validates categorical labels before converting
   generated values back into MockData's `mock_spec` levels.
+- The optional `simstudy` backend now rejects variables named `id`, which
+  conflicts with `simstudy`'s generated row identifier, and normalizes
+  categorical output through an explicit label-or-index validation path.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/mock_spec_postprocess.R b/R/mock_spec_postprocess.R
index d2751d4..8b394f2 100644
--- a/R/mock_spec_postprocess.R
+++ b/R/mock_spec_postprocess.R
@@ -322,10 +322,13 @@
 #' declared missing code.
 #'
 #' Garbage rules are applied in canonical order: `low`, then `high`, then any
-#' other named rules in caller order. Diagnostics are stored as a data-frame
-#' attribute. Base R subsetting and some downstream tools may drop attributes,
-#' so preserve the original post-processed object when diagnostics are part of
-#' the audit trail.
+#' other named rules in caller order. Each garbage rule is a named list with a
+#' `proportion` field and a `range` field using MockData range notation, for
+#' example `list(high = list(proportion = 0.05, range = "[150, 200]"))`.
+#'
+#' Diagnostics are stored as a data-frame attribute. Base R subsetting and some
+#' downstream tools may drop attributes, so preserve the original post-processed
+#' object when diagnostics are part of the audit trail.
 #'
 #' @family mock generation APIs
 #' @seealso [generate_mock_data_native()], [generate_mock_data_simstudy()],

From 9fe335e7cd3e9fd7ebe4732a984ef47619f324b7 Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Wed, 20 May 2026 06:33:18 -0400
Subject: [PATCH 16/18] Route create_mock_data through mock_spec pipeline

---
 NEWS.md                                    |   3 +
 R/create_mock_data.R                       | 109 ++++++++++++++++++++-
 tests/testthat/test-create-mock-data-v04.R | 105 ++++++++++++++++++++
 3 files changed, 215 insertions(+), 2 deletions(-)
 create mode 100644 tests/testthat/test-create-mock-data-v04.R

diff --git a/NEWS.md b/NEWS.md
index 3fcdd78..c92e2a5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -33,6 +33,9 @@
 - The optional `simstudy` backend now rejects variables named `id`, which
   conflicts with `simstudy`'s generated row identifier, and normalizes
   categorical output through an explicit label-or-index validation path.
+- `create_mock_data()` now attempts the v0.4 `mock_spec` pipeline in strict
+  mode for supported recodeflow metadata, while retaining the legacy `create_*`
+  dispatch path for unsupported v0.4 backend features and lenient generation.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/create_mock_data.R b/R/create_mock_data.R
index 2f5494c..158dc41 100644
--- a/R/create_mock_data.R
+++ b/R/create_mock_data.R
@@ -1,3 +1,83 @@
+#' @noRd
+.create_mock_data_v04_database_filter <- function(variables, databaseStart) {
+  if (!is.null(databaseStart) && "databaseStart" %in% names(variables)) {
+    return(databaseStart)
+  }
+
+  NULL
+}
+
+#' @noRd
+.create_mock_data_v04_native_supported <- function(spec) {
+  all(vapply(spec$variables, function(variable) {
+    formula <- variable$formula
+    has_formula <- !is.null(formula) &&
+      !(is.character(formula) && length(formula) == 1 && (is.na(formula) || trimws(formula) == ""))
+    if (has_formula) {
+      return(FALSE)
+    }
+
+    distribution <- tolower(variable$distribution %||% "uniform")
+    if (variable$type == "continuous") {
+      return(distribution %in% c("uniform", "normal"))
+    }
+    if (variable$type == "categorical") {
+      return(TRUE)
+    }
+    if (variable$type == "date") {
+      return(distribution == "uniform" && identical(variable$source_format %||% "analysis", "analysis"))
+    }
+
+    FALSE
+  }, logical(1)))
+}
+
+#' @noRd
+.create_mock_data_v04 <- function(databaseStart,
+                                  variables,
+                                  variable_details,
+                                  n,
+                                  seed,
+                                  verbose = FALSE) {
+  if (!is.null(databaseStart) &&
+      !"databaseStart" %in% names(variables) &&
+      "databaseStart" %in% names(variable_details)) {
+    if (isTRUE(verbose)) {
+      message(
+        "v0.4 mock_spec pipeline requires variable-level databaseStart when ",
+        "detail-level databaseStart filtering is needed; using legacy ",
+        "create_* dispatch."
+      )
+    }
+    return(NULL)
+  }
+
+  spec <- mock_spec_from_recodeflow(
+    variables = variables,
+    variable_details = variable_details,
+    databaseStart = .create_mock_data_v04_database_filter(variables, databaseStart),
+    role = "enabled"
+  )
+
+  if (!.create_mock_data_v04_native_supported(spec)) {
+    if (isTRUE(verbose)) {
+      message(
+        "v0.4 mock_spec pipeline does not yet support every requested ",
+        "variable; using legacy create_* dispatch."
+      )
+    }
+    return(NULL)
+  }
+
+  if (isTRUE(verbose)) {
+    message("Generating via v0.4 mock_spec pipeline.")
+  }
+
+  baseline <- generate_mock_data_native(spec, n = n, seed = seed)
+  postprocess_seed <- if (is.null(seed)) NULL else seed + 1L
+  postprocess_mock_data(baseline, spec, seed = postprocess_seed)
+}
+
 #' Create mock data from configuration files
 #'
 #' @description
@@ -38,8 +118,16 @@
 #' @return Data frame with n rows and one column per enabled variable.
 #'
 #' @details
-#' **v0.3.0 API**: This function now follows the "recodeflow pattern" where it passes
-#' full metadata data frames to create_* functions, which handle internal filtering.
+#' **v0.4.0 transition**: In strict mode, this function first attempts to use
+#' the v0.4 `mock_spec` pipeline: [mock_spec_from_recodeflow()],
+#' [generate_mock_data_native()], and [postprocess_mock_data()]. If the metadata
+#' requests a feature not yet supported by the v0.4 native backend, it falls
+#' back to the v0.3 `create_*` dispatch path so existing users can migrate
+#' gradually.
+#'
+#' **v0.3.0 API**: This function follows the "recodeflow pattern" where it passes
+#' full metadata data frames to create_* functions, which handle internal
+#' filtering.
 #'
 #' **Generation process**:
 #' \enumerate{
@@ -155,6 +243,23 @@ create_mock_data <- function(databaseStart,
     stop("variables must have a 'variableType' column")
   }
 
+  # ========== v0.4 PIPELINE PATH ==========
+
+  if (isTRUE(validate) && !is.null(variable_details)) {
+    v04_result <- .create_mock_data_v04(
+      databaseStart = databaseStart,
+      variables = variables,
+      variable_details = variable_details,
+      n = n,
+      seed = seed,
+      verbose = verbose
+    )
+
+    if (!is.null(v04_result)) {
+      return(v04_result)
+    }
+  }
+
   # ========== FILTER FOR ENABLED VARIABLES ==========
 
   if (verbose) message("Filtering for enabled variables...")
diff --git a/tests/testthat/test-create-mock-data-v04.R b/tests/testthat/test-create-mock-data-v04.R
new file mode 100644
index 0000000..916a82b
--- /dev/null
+++ b/tests/testthat/test-create-mock-data-v04.R
@@ -0,0 +1,105 @@
+test_that("create_mock_data uses the v0.4 pipeline for strict supported metadata", {
+  variables <- data.frame(
+    variable = "smoking",
+    variableType = "Categorical",
+    rType = "character",
+    role = "enabled",
+    garbage_low_prop = 0.1,
+    garbage_low_range = "[-2, 0]",
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = "smoking",
+    recStart = c("1", "2", "97"),
+    recEnd = c("copy", "copy", "NA::b"),
+    proportion = c(0.6, 0.3, 0.1),
+    stringsAsFactors = FALSE
+  )
+
+  result <- create_mock_data(
+    databaseStart = "study",
+    variables = variables,
+    variable_details = variable_details,
+    n = 100,
+    seed = 101
+  )
+  diagnostics <- attr(result, "mockdata_diagnostics")$variables$smoking
+
+  expect_equal(names(result), "smoking")
+  expect_true(all(result$smoking %in% c("1", "2", "97", "-2", "-1", "0")))
+  expect_equal(length(diagnostics$assigned_missing_indices), 10)
+  expect_equal(length(diagnostics$assigned_garbage_indices$low), 9)
+  expect_length(intersect(
+    diagnostics$assigned_missing_indices,
+    diagnostics$assigned_garbage_indices$low
+  ), 0)
+})
+
+test_that("create_mock_data keeps legacy fallback for unsupported v0.4 backend features", {
+  variables <- data.frame(
+    variable = "time_to_visit",
+    variableType = "Continuous",
+    rType = "double",
+    role = "enabled",
+    distribution = "exponential",
+    rate = 0.5,
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = "time_to_visit",
+    recStart = "[0, 10]",
+    recEnd = "copy",
+    proportion = 1,
+    stringsAsFactors = FALSE
+  )
+
+  expect_message(
+    result <- create_mock_data(
+      databaseStart = "study",
+      variables = variables,
+      variable_details = variable_details,
+      n = 50,
+      seed = 202,
+      verbose = TRUE
+    ),
+    "legacy create_\\* dispatch"
+  )
+
+  expect_equal(names(result), "time_to_visit")
+  expect_equal(nrow(result), 50)
+  expect_true(is.numeric(result$time_to_visit))
+  expect_null(attr(result, "mockdata_diagnostics"))
+})
+
+test_that("create_mock_data keeps legacy detail-level databaseStart filtering", {
+  variables <- data.frame(
+    variable = "smoking",
+    variableType = "Categorical",
+    rType = "character",
+    role = "enabled",
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = c("smoking", "smoking"),
+    recStart = c("1", "2"),
+    recEnd = c("copy", "copy"),
+    proportion = c(1, 1),
+    databaseStart = c("cycle1", "cycle10"),
+    stringsAsFactors = FALSE
+  )
+
+  expect_message(
+    result <- create_mock_data(
+      databaseStart = "cycle1",
+      variables = variables,
+      variable_details = variable_details,
+      n = 10,
+      seed = 303,
+      verbose = TRUE
+    ),
+    "detail-level databaseStart filtering"
+  )
+
+  expect_equal(unique(result$smoking), "1")
+  expect_null(attr(result, "mockdata_diagnostics"))
+})

From ca00377c84d5035e176743a830f9cfb3a1a8a63e Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Wed, 20 May 2026 07:16:49 -0400
Subject: [PATCH 17/18] Document and test create_mock_data v04 routing

---
 NEWS.md                                    |   4 +
 R/create_mock_data.R                       |  60 ++++++++---
 tests/testthat/test-create-mock-data-v04.R | 111 ++++++++++++++++++++-
 3 files changed, 160 insertions(+), 15 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c92e2a5..434a5e8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -36,6 +36,10 @@
 - `create_mock_data()` now attempts the v0.4 `mock_spec` pipeline in strict
   mode for supported recodeflow metadata, while retaining the legacy `create_*`
   dispatch path for unsupported v0.4 backend features and lenient generation.
+  The v0.4 path attaches `mockdata_diagnostics` and uses `seed` for baseline
+  generation plus `seed + 1` for post-processing, so exact seeded output may
+  differ from v0.3.x even when the public seed is unchanged. Verbose mode now
+  reports whether the v0.4 or legacy path was chosen.
 - Added forward-compatible specification fields: `spec_version`, `provenance`,
   and `model_hint`.
 - Existing v0.3 generator APIs remain available while v0.4 internals are built.
diff --git a/R/create_mock_data.R b/R/create_mock_data.R
index 158dc41..2ba23ec 100644
--- a/R/create_mock_data.R
+++ b/R/create_mock_data.R
@@ -8,28 +8,35 @@
 }
 
 #' @noRd
-.create_mock_data_v04_native_supported <- function(spec) {
-  all(vapply(spec$variables, function(variable) {
+.create_mock_data_v04_unsupported_variables <- function(spec) {
+  unsupported <- vapply(spec$variables, function(variable) {
     formula <- variable$formula
     has_formula <- !is.null(formula) &&
       !(is.character(formula) && length(formula) == 1 && (is.na(formula) || trimws(formula) == ""))
     if (has_formula) {
-      return(FALSE)
+      return(TRUE)
     }
 
     distribution <- tolower(variable$distribution %||% "uniform")
     if (variable$type == "continuous") {
-      return(distribution %in% c("uniform", "normal"))
+      return(!distribution %in% c("uniform", "normal"))
     }
     if (variable$type == "categorical") {
-      return(TRUE)
+      return(FALSE)
     }
     if (variable$type == "date") {
-      return(distribution == "uniform" && identical(variable$source_format %||% "analysis", "analysis"))
+      return(!(distribution == "uniform" && identical(variable$source_format %||% "analysis", "analysis")))
     }
 
-    FALSE
-  }, logical(1)))
+    TRUE
+  }, logical(1))
+
+  names(spec$variables)[unsupported]
+}
+
+#' @noRd
+.create_mock_data_v04_native_supported <- function(spec) {
+  length(.create_mock_data_v04_unsupported_variables(spec)) == 0
 }
 
 #' @noRd
@@ -59,11 +66,13 @@
     role = "enabled"
   )
 
-  if (!.create_mock_data_v04_native_supported(spec)) {
+  unsupported <- .create_mock_data_v04_unsupported_variables(spec)
+  if (length(unsupported) > 0) {
     if (isTRUE(verbose)) {
       message(
         "v0.4 mock_spec pipeline does not yet support every requested ",
-        "variable; using legacy create_* dispatch."
+        "variable; using legacy create_* dispatch. Unsupported variable(s): ",
+        paste(unsupported, collapse = ", ")
       )
     }
     return(NULL)
@@ -74,6 +83,9 @@
   }
 
   baseline <- generate_mock_data_native(spec, n = n, seed = seed)
+  # The wrapper uses a second deterministic stream for post-processing so
+  # baseline generation and missing/garbage assignment can be reproduced
+  # independently from the single public seed.
   postprocess_seed <- if (is.null(seed)) NULL else seed + 1L
   postprocess_mock_data(baseline, spec, seed = postprocess_seed)
 }
@@ -115,7 +127,10 @@
 #'   affected variable is skipped.
 #' @param verbose Logical. Whether to print progress messages (default FALSE).
 #'
-#' @return Data frame with n rows and one column per enabled variable.
+#' @return Data frame with n rows and one column per enabled variable. When the
+#'   v0.4 `mock_spec` path is used, the result also carries a
+#'   `mockdata_diagnostics` attribute from [postprocess_mock_data()]. Legacy
+#'   fallback paths return plain data frames without that attribute.
 #'
 #' @details
 #' **v0.4.0 transition**: In strict mode, this function first attempts to use
@@ -125,6 +140,16 @@
 #' back to the v0.3 `create_*` dispatch path so existing users can migrate
 #' gradually.
 #'
+#' The wrapper deliberately stays on the legacy path when `validate = FALSE`,
+#' when `variable_details = NULL`, when detail-level `databaseStart` filtering is
+#' needed but the variables metadata has no `databaseStart` column, or when a
+#' variable uses a feature not yet supported by the v0.4 native backend. Set
+#' `verbose = TRUE` to see which path was chosen.
+#'
+#' In the v0.4 path, `seed` is used for baseline generation and `seed + 1` is
+#' used for post-processing. This makes both stages deterministic, but generated
+#' values may differ from v0.3.x output for the same seed.
+#'
 #' **v0.3.0 API**: This function follows the "recodeflow pattern" where it passes
 #' full metadata data frames to create_* functions, which handle internal
 #' filtering.
@@ -194,6 +219,9 @@
 #' }
 #'
 #' @family generators
+#' @family mock generation APIs
+#' @seealso [mock_spec_from_recodeflow()], [generate_mock_data_native()],
+#'   [postprocess_mock_data()], [generate_mock_data_simstudy()], [mock_spec()]
 #' @export
 create_mock_data <- function(databaseStart,
                              variables,
@@ -245,7 +273,15 @@ create_mock_data <- function(databaseStart,
 
   # ========== v0.4 PIPELINE PATH ==========
 
-  if (isTRUE(validate) && !is.null(variable_details)) {
+  if (!isTRUE(validate)) {
+    if (verbose) {
+      message("validate = FALSE requested; using legacy create_* dispatch.")
+    }
+  } else if (is.null(variable_details)) {
+    if (verbose) {
+      message("variable_details = NULL; using legacy create_* fallback dispatch.")
+    }
+  } else {
     v04_result <- .create_mock_data_v04(
       databaseStart = databaseStart,
       variables = variables,
diff --git a/tests/testthat/test-create-mock-data-v04.R b/tests/testthat/test-create-mock-data-v04.R
index 916a82b..781ffde 100644
--- a/tests/testthat/test-create-mock-data-v04.R
+++ b/tests/testthat/test-create-mock-data-v04.R
@@ -27,8 +27,10 @@ test_that("create_mock_data uses the v0.4 pipeline for strict supported metadata
 
   expect_equal(names(result), "smoking")
   expect_true(all(result$smoking %in% c("1", "2", "97", "-2", "-1", "0")))
-  expect_equal(length(diagnostics$assigned_missing_indices), 10)
-  expect_equal(length(diagnostics$assigned_garbage_indices$low), 9)
+  expect_true(length(diagnostics$assigned_missing_indices) >= 8)
+  expect_true(length(diagnostics$assigned_missing_indices) <= 12)
+  expect_true(length(diagnostics$assigned_garbage_indices$low) >= 7)
+  expect_true(length(diagnostics$assigned_garbage_indices$low) <= 11)
   expect_length(intersect(
     diagnostics$assigned_missing_indices,
     diagnostics$assigned_garbage_indices$low
@@ -62,7 +64,7 @@ test_that("create_mock_data keeps legacy fallback for unsupported v0.4 backend f
       seed = 202,
       verbose = TRUE
     ),
-    "legacy create_\\* dispatch"
+    "Unsupported variable\\(s\\): time_to_visit"
   )
 
   expect_equal(names(result), "time_to_visit")
@@ -103,3 +105,106 @@ test_that("create_mock_data keeps legacy detail-level databaseStart filtering",
   expect_equal(unique(result$smoking), "1")
   expect_null(attr(result, "mockdata_diagnostics"))
 })
+
+test_that("create_mock_data announces validate FALSE legacy path", {
+  variables <- data.frame(
+    variable = "smoking",
+    variableType = "Categorical",
+    rType = "character",
+    role = "enabled",
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = "smoking",
+    recStart = c("1", "2"),
+    recEnd = c("copy", "copy"),
+    proportion = c(0.6, 0.4),
+    stringsAsFactors = FALSE
+  )
+
+  expect_message(
+    result <- create_mock_data(
+      databaseStart = "study",
+      variables = variables,
+      variable_details = variable_details,
+      n = 20,
+      seed = 404,
+      validate = FALSE,
+      verbose = TRUE
+    ),
+    "validate = FALSE requested"
+  )
+
+  expect_equal(names(result), "smoking")
+  expect_null(attr(result, "mockdata_diagnostics"))
+})
+
+test_that("create_mock_data announces variable_details NULL legacy fallback path", {
+  variables <- data.frame(
+    variable = "age",
+    variableType = "Continuous",
+    rType = "integer",
+    role = "enabled",
+    stringsAsFactors = FALSE
+  )
+
+  expect_warning(
+    expect_message(
+      result <- create_mock_data(
+        databaseStart = "study",
+        variables = variables,
+        variable_details = NULL,
+        n = 20,
+        seed = 505,
+        verbose = TRUE
+      ),
+      "variable_details = NULL"
+    ),
+    "No variable_details rows found"
+  )
+
+  expect_equal(names(result), "age")
+  expect_null(attr(result, "mockdata_diagnostics"))
+})
+
+test_that("create_mock_data v0.4 and legacy paths are distributionally aligned", {
+  variables <- data.frame(
+    variable = "smoking",
+    variableType = "Categorical",
+    rType = "character",
+    role = "enabled",
+    stringsAsFactors = FALSE
+  )
+  variable_details <- data.frame(
+    variable = "smoking",
+    recStart = c("1", "2"),
+    recEnd = c("copy", "copy"),
+    proportion = c(0.65, 0.35),
+    stringsAsFactors = FALSE
+  )
+
+  v04 <- create_mock_data(
+    databaseStart = "study",
+    variables = variables,
+    variable_details = variable_details,
+    n = 5000,
+    seed = 606
+  )
+  legacy <- create_mock_data(
+    databaseStart = "study",
+    variables = variables,
+    variable_details = variable_details,
+    n = 5000,
+    seed = 606,
+    validate = FALSE
+  )
+
+  expect_equal(sort(unique(v04$smoking)), sort(unique(legacy$smoking)))
+  expect_equal(
+    unname(prop.table(table(v04$smoking))[c("1", "2")]),
+    unname(prop.table(table(legacy$smoking))[c("1", "2")]),
+    tolerance = 0.03
+  )
+  expect_type(attr(v04, "mockdata_diagnostics"), "list")
+  expect_null(attr(legacy, "mockdata_diagnostics"))
+})

From 54047316b57c765fa8f79eafc9120041b1a948bf Mon Sep 17 00:00:00 2001
From: Doug Manuel <DougManuel@users.noreply.github.com>
Date: Wed, 20 May 2026 08:57:07 -0400
Subject: [PATCH 18/18] Add v04 functions to pkgdown reference index

---
 _pkgdown.yml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index 5347b43..d17d53a 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -13,6 +13,32 @@ footer:
     developed_by: <p>Developed by Juan Li, Douglas Manuel, and recodeflow contributors.</p>
 
 reference:
+- title: v0.4 specification API
+  desc: >
+    Build and validate normalized `mock_spec` objects using direct helpers or
+    composable variable constructors.
+  contents:
+  - mock_spec
+  - mock_continuous
+  - mock_categorical
+  - mock_date
+  - mock_spec_continuous
+  - mock_spec_categorical
+  - mock_spec_date
+  - is_mock_spec
+  - validate_mock_spec
+  - mock_spec_model_hints
+
+- title: v0.4 adapters and generation pipeline
+  desc: >
+    Convert recodeflow metadata into `mock_spec` objects, generate baseline data,
+    and apply post-processing diagnostics.
+  contents:
+  - mock_spec_from_recodeflow
+  - generate_mock_data_native
+  - generate_mock_data_simstudy
+  - postprocess_mock_data
+
 - title: Main generation functions
   desc: >
     Generate categorical, continuous, date, and survival variables. Use `create_mock_data()`