|
| 1 | +#' |
| 2 | +#' @title Missing data pattern with disclosure control |
| 3 | +#' @description This function is a serverside aggregate function that computes the |
| 4 | +#' missing data pattern using mice::md.pattern and applies disclosure control to |
| 5 | +#' prevent revealing small cell counts. |
| 6 | +#' @details This function calls the mice::md.pattern function to generate a matrix |
| 7 | +#' showing the missing data patterns in the input data. To ensure disclosure control, |
| 8 | +#' any pattern counts that are below the threshold (nfilter.tab, default=3) are |
| 9 | +#' suppressed. |
| 10 | +#' |
| 11 | +#' \strong{Suppression Method:} |
| 12 | +#' |
| 13 | +#' When a pattern count is below threshold: |
| 14 | +#' - Row name is changed to "suppressed(<N>)" where N is the threshold |
| 15 | +#' - All pattern values in that row are set to NA |
| 16 | +#' - Summary row is also set to NA (prevents back-calculation) |
| 17 | +#' |
| 18 | +#' \strong{Output Matrix Structure:} |
| 19 | +#' |
| 20 | +#' - Rows represent different missing data patterns (plus a summary row at the bottom) |
| 21 | +#' - Row names contain pattern counts (or "suppressed(<N>)" for invalid patterns) |
| 22 | +#' - Columns show 1 if variable is observed, 0 if missing |
| 23 | +#' - Last column shows total number of missing values per pattern |
| 24 | +#' - Last row shows total number of missing values per variable |
| 25 | +#' |
| 26 | +#' \strong{Note for Pooling:} |
| 27 | +#' |
| 28 | +#' When this function is called from ds.mdPattern with type='combine', suppressed |
| 29 | +#' patterns are excluded from pooling to prevent disclosure through subtraction. |
| 30 | +#' This means pooled counts may underestimate the true total when patterns are |
| 31 | +#' suppressed in some studies. |
| 32 | +#' |
| 33 | +#' @param x a character string specifying the name of a data frame or matrix |
| 34 | +#' containing the data to analyze for missing patterns. |
| 35 | +#' @return A list containing: |
| 36 | +#' \item{pattern}{The missing data pattern matrix with disclosure control applied} |
| 37 | +#' \item{valid}{Logical indicating if all patterns meet disclosure requirements} |
| 38 | +#' \item{message}{A message describing the validity status} |
| 39 | +#' @author Xavier Escribà montagut for DataSHIELD Development Team |
| 40 | +#' @import mice |
| 41 | +#' @export |
| 42 | +#' |
| 43 | +mdPatternDS <- function(x){ |
| 44 | + |
| 45 | + ############################################################# |
| 46 | + # MODULE 1: CAPTURE THE nfilter SETTINGS |
| 47 | + thr <- dsBase::listDisclosureSettingsDS() |
| 48 | + nfilter.tab <- as.numeric(thr$nfilter.tab) |
| 49 | + ############################################################# |
| 50 | + |
| 51 | + # Parse the input data name with error handling |
| 52 | + x.val <- tryCatch( |
| 53 | + { |
| 54 | + eval(parse(text=x), envir = parent.frame()) |
| 55 | + }, |
| 56 | + error = function(e) { |
| 57 | + stop(paste0("Object '", x, "' does not exist on the server"), call. = FALSE) |
| 58 | + } |
| 59 | + ) |
| 60 | + |
| 61 | + # Check object class |
| 62 | + typ <- class(x.val) |
| 63 | + |
| 64 | + # Check that input is a data frame or matrix |
| 65 | + if(!("data.frame" %in% typ || "matrix" %in% typ)){ |
| 66 | + stop(paste0("The input object must be of type 'data.frame' or 'matrix'. Current type: ", |
| 67 | + paste(typ, collapse = ", ")), call. = FALSE) |
| 68 | + } |
| 69 | + |
| 70 | + # Use x.val for further processing |
| 71 | + x <- x.val |
| 72 | + |
| 73 | + # Call mice::md.pattern with plot=FALSE |
| 74 | + pattern <- mice::md.pattern(x, plot = FALSE) |
| 75 | + |
| 76 | + # Apply disclosure control |
| 77 | + # Pattern counts are stored in row names (except last row which is empty/summary) |
| 78 | + # The last row contains variable-level missing counts |
| 79 | + |
| 80 | + validity <- "valid" |
| 81 | + n_patterns <- nrow(pattern) - 1 # exclude the summary row |
| 82 | + |
| 83 | + if(n_patterns > 0){ |
| 84 | + # Check pattern counts (stored in row names, excluding last row) |
| 85 | + pattern_counts <- as.numeric(rownames(pattern)[1:n_patterns]) |
| 86 | + |
| 87 | + # Find patterns with counts below threshold |
| 88 | + invalid_idx <- which(pattern_counts > 0 & pattern_counts < nfilter.tab) |
| 89 | + |
| 90 | + if(length(invalid_idx) > 0){ |
| 91 | + validity <- "invalid" |
| 92 | + |
| 93 | + # For invalid patterns, suppress by: |
| 94 | + # - Setting row name to "suppressed" |
| 95 | + # - Setting all pattern values to NA |
| 96 | + rnames <- rownames(pattern) |
| 97 | + for(idx in invalid_idx){ |
| 98 | + rnames[idx] <- paste0("suppressed(<", nfilter.tab, ")") |
| 99 | + pattern[idx, ] <- NA |
| 100 | + } |
| 101 | + rownames(pattern) <- rnames |
| 102 | + |
| 103 | + # Also need to recalculate the last row (summary) if patterns were suppressed |
| 104 | + # Set to NA to avoid disclosures |
| 105 | + pattern[nrow(pattern), seq_len(ncol(pattern))] <- NA |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + # Return the pattern with validity information |
| 110 | + return(list( |
| 111 | + pattern = pattern, |
| 112 | + valid = (validity == "valid"), |
| 113 | + message = ifelse(validity == "valid", |
| 114 | + "Valid: all pattern counts meet disclosure requirements", |
| 115 | + paste0("Invalid: some pattern counts below threshold (", |
| 116 | + nfilter.tab, ") have been suppressed")) |
| 117 | + )) |
| 118 | +} |
| 119 | + |
| 120 | +#AGGREGATE FUNCTION |
| 121 | +# mdPatternDS |
0 commit comments