Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Suggests:
readr,
stringr,
lubridate,
simstudy (>= 0.8.1),
knitr,
quarto,
devtools,
Expand Down
14 changes: 14 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

S3method(print,mock_spec_validation_result)
S3method(print,mockdata_validation_result)
export(add_garbage)
export(apply_garbage)
Expand All @@ -14,6 +15,8 @@ export(create_wide_survival_data)
export(extract_distribution_params)
export(extract_proportions)
export(generate_garbage_values)
export(generate_mock_data_native)
export(generate_mock_data_simstudy)
export(get_cycle_variables)
export(get_enabled_variables)
export(get_raw_var_dependencies)
Expand All @@ -23,14 +26,25 @@ export(get_variables_by_role)
export(has_garbage)
export(identify_derived_vars)
export(import_from_recodeflow)
export(is_mock_spec)
export(make_garbage)
export(mock_categorical)
export(mock_continuous)
export(mock_date)
export(mock_spec)
export(mock_spec_categorical)
export(mock_spec_continuous)
export(mock_spec_date)
export(mock_spec_from_recodeflow)
export(parse_range_notation)
export(parse_variable_start)
export(postprocess_mock_data)
export(read_mock_data_config)
export(read_mock_data_config_details)
export(sample_with_proportions)
export(validate_mock_data_config)
export(validate_mock_data_config_details)
export(validate_mock_spec)
export(validate_mockdata_metadata)
importFrom(stats,rexp)
importFrom(stats,rnorm)
Expand Down
46 changes: 46 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,49 @@
# MockData 0.4.0

## Development

- Started the v0.4 production refactor around a normalized `mock_spec`
architecture.
- Added `mock_spec()`, `mock_spec_continuous()`, `mock_spec_categorical()`,
`mock_spec_date()`, `is_mock_spec()`, and `validate_mock_spec()`.
- Added direct specification helpers `mock_continuous()`,
`mock_categorical()`, and `mock_date()` for simple use without
recodeflow-style metadata tables.
- Added `mock_spec_from_recodeflow()` to adapt recodeflow-style `variables`
and `variable_details` metadata into validated `mock_spec` objects while
preserving role/database filtering, categorical proportions, `recEnd`
missing-code semantics, valid ranges, garbage rules, date ranges, and
survival/date fields.
- Added `generate_mock_data_native()` to generate baseline valid mock data from
`mock_spec` objects with the native R backend.
- Added `postprocess_mock_data()` to apply `mock_spec` missing-code and
garbage-value rules after baseline generation, with diagnostics that
distinguish assigned missing/garbage rows from naturally drawn values.
- Post-processing diagnostics now protect naturally drawn missing-code
collisions from later garbage assignment, apply garbage rules in canonical
`low` -> `high` -> other order, and stop on repeated post-processing. This
prevents silent diagnostic drift when a naturally drawn missing-code value
would otherwise be overwritten by garbage assignment.
- Added `generate_mock_data_simstudy()` as a soft-gated optional backend for
baseline categorical and uniform continuous generation when `simstudy` is
installed, with native generation retained for MockData-specific semantics.
- The optional `simstudy` backend is kept in `Suggests`, requires
`simstudy >= 0.8.1`, and validates categorical labels before converting
generated values back into MockData's `mock_spec` levels.
- The optional `simstudy` backend now rejects variables named `id`, because
that name conflicts with `simstudy`'s generated row identifier, and normalizes
categorical output through an explicit label-or-index validation path.
- `create_mock_data()` now attempts the v0.4 `mock_spec` pipeline in strict
mode for supported recodeflow metadata, while retaining the legacy `create_*`
dispatch path for unsupported v0.4 backend features and lenient generation.
The v0.4 path attaches `mockdata_diagnostics` and uses `seed` for baseline
generation plus `seed + 1` for post-processing, so exact seeded output may
differ from v0.3.x even when the public seed is unchanged. Verbose mode now
reports whether the v0.4 or legacy path was chosen.
- Added forward-compatible specification fields: `spec_version`, `provenance`,
and `model_hint`.
- Existing v0.3 generator APIs remain available while v0.4 internals are built.

# MockData 0.3.0

## Breaking changes
Expand Down
147 changes: 144 additions & 3 deletions R/create_mock_data.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,95 @@
#' @noRd
.create_mock_data_v04_database_filter <- function(variables, databaseStart) {
  # Pass `databaseStart` through only when a value was supplied AND the
  # variables metadata has a `databaseStart` column; otherwise yield NULL.
  can_filter <- !is.null(databaseStart) &&
    "databaseStart" %in% names(variables)

  if (can_filter) databaseStart else NULL
}

#' @noRd
.create_mock_data_v04_unsupported_variables <- function(spec) {
  # Returns the names of the entries in `spec$variables` that are flagged as
  # unsupported. An entry is flagged when any of the following holds:
  #   * it carries a non-blank `formula`;
  #   * it is "continuous" with a distribution other than uniform/normal;
  #   * it is "date" and is not (uniform distribution + "analysis"
  #     source_format);
  #   * its `type` is none of "continuous", "categorical", "date".
  # "categorical" entries without a formula are never flagged.
  unsupported <- vapply(spec$variables, function(variable) {
    formula <- variable$formula
    # NULL, a single NA string, and a single blank string all mean
    # "no formula"; anything else counts as a formula.
    has_formula <- !is.null(formula) &&
      !(is.character(formula) && length(formula) == 1 &&
        (is.na(formula) || trimws(formula) == ""))
    if (has_formula) {
      return(TRUE)
    }

    # A missing distribution defaults to "uniform".
    distribution <- tolower(variable$distribution %||% "uniform")
    if (variable$type == "continuous") {
      return(!distribution %in% c("uniform", "normal"))
    }
    if (variable$type == "categorical") {
      return(FALSE)
    }
    if (variable$type == "date") {
      # isTRUE() makes an NA distribution count as "not uniform". A bare `==`
      # would propagate NA into the flag vector, and indexing names() with NA
      # yields an NA entry instead of the variable's name. This also matches
      # the continuous branch, where `%in%` already treats NA as unsupported.
      is_uniform <- isTRUE(distribution == "uniform")
      is_analysis <- identical(
        variable$source_format %||% "analysis",
        "analysis"
      )
      return(!(is_uniform && is_analysis))
    }

    # Unknown variable types are reported as unsupported.
    TRUE
  }, logical(1))

  names(spec$variables)[unsupported]
}

#' @noRd
.create_mock_data_v04_native_supported <- function(spec) {
  # TRUE when no variable in `spec` is reported as unsupported.
  unsupported_names <- .create_mock_data_v04_unsupported_variables(spec)
  length(unsupported_names) == 0
}

#' @noRd
.create_mock_data_v04 <- function(databaseStart,
                                  variables,
                                  variable_details,
                                  n,
                                  seed,
                                  verbose = FALSE) {
  # Attempts generation through the mock_spec pipeline. Returns NULL to tell
  # the caller that this path declined the request; otherwise returns the
  # result of postprocess_mock_data().
  announce <- isTRUE(verbose)

  # Guard 1: a databaseStart filter was requested, but only the details table
  # (not the variables table) has a databaseStart column.
  filter_requested <- !is.null(databaseStart)
  variables_have_column <- "databaseStart" %in% names(variables)
  details_have_column <- "databaseStart" %in% names(variable_details)
  if (filter_requested && !variables_have_column && details_have_column) {
    if (announce) {
      message(
        "v0.4 mock_spec pipeline requires variable-level databaseStart when ",
        "detail-level databaseStart filtering is needed; using legacy ",
        "create_* dispatch."
      )
    }
    return(NULL)
  }

  database_filter <- .create_mock_data_v04_database_filter(
    variables,
    databaseStart
  )
  spec <- mock_spec_from_recodeflow(
    variables = variables,
    variable_details = variable_details,
    databaseStart = database_filter,
    role = "enabled"
  )

  # Guard 2: at least one variable in the spec is flagged as unsupported.
  unsupported_names <- .create_mock_data_v04_unsupported_variables(spec)
  if (length(unsupported_names) > 0) {
    if (announce) {
      message(
        "v0.4 mock_spec pipeline does not yet support every requested ",
        "variable; using legacy create_* dispatch. Unsupported variable(s): ",
        paste(unsupported_names, collapse = ", ")
      )
    }
    return(NULL)
  }

  if (announce) {
    message("Generating via v0.4 mock_spec pipeline.")
  }

  # Stage 1 uses the caller's seed as-is. Stage 2 uses seed + 1 so each stage
  # has its own deterministic stream derived from the one public seed; a NULL
  # seed is passed through as NULL to both stages.
  postprocess_seed <- NULL
  if (!is.null(seed)) {
    postprocess_seed <- seed + 1L
  }

  baseline <- generate_mock_data_native(spec, n = n, seed = seed)
  postprocess_mock_data(baseline, spec, seed = postprocess_seed)
}

#' Create mock data from configuration files
#'
#' @description
Expand Down Expand Up @@ -35,11 +127,32 @@
#' affected variable is skipped.
#' @param verbose Logical. Whether to print progress messages (default FALSE).
#'
#' @return Data frame with n rows and one column per enabled variable.
#' @return Data frame with n rows and one column per enabled variable. When the
#' v0.4 `mock_spec` path is used, the result also carries a
#' `mockdata_diagnostics` attribute from [postprocess_mock_data()]. Legacy
#' fallback paths return plain data frames without that attribute.
#'
#' @details
#' **v0.3.0 API**: This function now follows the "recodeflow pattern" where it passes
#' full metadata data frames to create_* functions, which handle internal filtering.
#' **v0.4.0 transition**: In strict mode, this function first attempts to use
#' the v0.4 `mock_spec` pipeline: [mock_spec_from_recodeflow()],
#' [generate_mock_data_native()], and [postprocess_mock_data()]. If the metadata
#' requests a feature not yet supported by the v0.4 native backend, it falls
#' back to the v0.3 `create_*` dispatch path so existing users can migrate
#' gradually.
#'
#' The wrapper deliberately stays on the legacy path when `validate = FALSE`,
#' when `variable_details = NULL`, when detail-level `databaseStart` filtering is
#' needed but the variables metadata has no `databaseStart` column, or when a
#' variable uses a feature not yet supported by the v0.4 native backend. Set
#' `verbose = TRUE` to see which path was chosen.
#'
#' In the v0.4 path, `seed` is used for baseline generation and `seed + 1` is
#' used for post-processing. This makes both stages deterministic, but generated
#' values may differ from v0.3.x output for the same seed.
#'
#' **v0.3.0 API**: This function follows the "recodeflow pattern" where it passes
#' full metadata data frames to create_* functions, which handle internal
#' filtering.
#'
#' **Generation process**:
#' \enumerate{
Expand Down Expand Up @@ -106,6 +219,9 @@
#' }
#'
#' @family generators
#' @family mock generation APIs
#' @seealso [mock_spec_from_recodeflow()], [generate_mock_data_native()],
#' [postprocess_mock_data()], [generate_mock_data_simstudy()], [mock_spec()]
#' @export
create_mock_data <- function(databaseStart,
variables,
Expand Down Expand Up @@ -155,6 +271,31 @@ create_mock_data <- function(databaseStart,
stop("variables must have a 'variableType' column")
}

# ========== v0.4 PIPELINE PATH ==========

if (!isTRUE(validate)) {
if (verbose) {
message("validate = FALSE requested; using legacy create_* dispatch.")
}
} else if (is.null(variable_details)) {
if (verbose) {
message("variable_details = NULL; using legacy create_* fallback dispatch.")
}
} else {
v04_result <- .create_mock_data_v04(
databaseStart = databaseStart,
variables = variables,
variable_details = variable_details,
n = n,
seed = seed,
verbose = verbose
)

if (!is.null(v04_result)) {
return(v04_result)
}
}

# ========== FILTER FOR ENABLED VARIABLES ==========

if (verbose) message("Filtering for enabled variables...")
Expand Down
Loading
Loading