diff --git a/code/snt_utils.r b/code/snt_utils.r index 8efda14..838e276 100644 --- a/code/snt_utils.r +++ b/code/snt_utils.r @@ -58,6 +58,114 @@ install_and_load <- function(packages) { print(loaded_packages) } +# Load SNT configuration file with a consistent error message +load_snt_config <- function(config_path, config_file_name = "SNT_config.json") { + config_file <- file.path(config_path, config_file_name) + config_json <- tryCatch( + { + jsonlite::fromJSON(config_file) + }, + error = function(e) { + msg <- paste0("[ERROR] Error while loading configuration ", conditionMessage(e)) + stop(msg) + } + ) + return(config_json) +} + +# Validate that required keys exist in a config section +validate_required_config_keys <- function(config_json, keys, section = "SNT_CONFIG") { + if (is.null(config_json[[section]])) { + stop(paste0("[ERROR] Missing configuration section: ", section)) + } + + missing_keys <- keys[!keys %in% names(config_json[[section]])] + if (length(missing_keys) > 0) { + stop(paste0("[ERROR] Missing configuration input(s): ", paste(missing_keys, collapse = ", "))) + } + + invisible(TRUE) +} + +# Generic helper to load a country-specific dataset file +load_country_file_from_dataset <- function(dataset_id, country_code, suffix, label = NULL) { + file_name <- paste0(country_code, suffix) + output_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, file_name) + }, + error = function(e) { + target_label <- if (is.null(label)) file_name else label + msg <- paste0( + "[ERROR] Error while loading ", + target_label, + " (dataset: ", + dataset_id, + ", file: ", + file_name, + "): ", + conditionMessage(e) + ) + stop(msg) + } + ) + + log_msg(paste0("Loaded file `", file_name, "` from dataset `", dataset_id, "`.")) + return(output_data) +} + +# Ensure YEAR and MONTH are stored as integers when present +normalize_year_month_types <- function(input_df, year_col = "YEAR", month_col = "MONTH") { + output_df <- input_df + if (year_col %in% 
names(output_df)) { + output_df[[year_col]] <- as.integer(output_df[[year_col]]) + } + if (month_col %in% names(output_df)) { + output_df[[month_col]] <- as.integer(output_df[[month_col]]) + } + return(output_df) +} + +# Standard routine preparation: select, pivot longer, optional deduplication +prepare_routine_long <- function(routine_df, fixed_cols, indicators, deduplicate = TRUE) { + cols_to_select <- intersect(c(fixed_cols, indicators), names(routine_df)) + missing_indicators <- setdiff(indicators, names(routine_df)) + if (length(missing_indicators) > 0) { + stop(paste0("[ERROR] Missing indicator column(s): ", paste(missing_indicators, collapse = ", "))) + } + + routine_long <- routine_df %>% + dplyr::select(dplyr::all_of(cols_to_select)) %>% + tidyr::pivot_longer( + cols = dplyr::all_of(indicators), + names_to = "INDICATOR", + values_to = "VALUE" + ) + + if (deduplicate) { + dedup_keys <- intersect(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "YEAR", "MONTH", "INDICATOR"), names(routine_long)) + routine_long <- routine_long %>% + dplyr::distinct(dplyr::across(dplyr::all_of(dedup_keys)), .keep_all = TRUE) + } + + return(routine_long) +} + +# Build a standardized output path under /data and create it if needed +standard_output_path <- function(data_root_path, domain, subdomain = NULL, create_dir = TRUE) { + target_path <- if (is.null(subdomain) || nchar(subdomain) == 0) { + file.path(data_root_path, domain) + } else { + file.path(data_root_path, domain, subdomain) + } + + if (create_dir && !dir.exists(target_path)) { + dir.create(target_path, recursive = TRUE, showWarnings = FALSE) + } + + return(target_path) +} + # Helper to safely extract values from parameters (allows to specify the type) get_param <- function(params_list, target_param, default, cast_method = identity) { #' Safely retrieve a parameter if it exists in the input, using a default fallback if it doesn't exist in the inupt diff --git 
a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index 9d03d45..e91dd17 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -59,24 +59,16 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_iqr.r\"))\n", + "setup_ctx <- bootstrap_iqr_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -120,15 +112,9 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " 
error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -142,16 +128,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", @@ -190,15 +167,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: 
{nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" ] @@ -214,8 +188,7 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR/MONTH casting is handled inside load_routine_data()." ] }, { @@ -237,14 +210,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) {\n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." ] }, { @@ -536,23 +502,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr.r\n", + "start_time <- Sys.time()" ] }, { @@ -629,24 +580,7 @@ }, "outputs": [], "source": [ - "# Define helper function 
to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr.r" ] }, { @@ -661,8 +595,22 @@ "outputs": [], "source": [ "# Format IQR tables (imputed and removed)\n", - "dhis2_routine_iqr_imputed <- format_routine_data_selection(dhis2_routine_outliers_iqr_imputed, iqr_column)\n", - "dhis2_routine_iqr_removed <- format_routine_data_selection(dhis2_routine_outliers_iqr_imputed, iqr_column, remove = TRUE)" + "dhis2_routine_iqr_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_iqr_imputed,\n", + " outlier_column = iqr_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_iqr_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_iqr_imputed,\n", + " outlier_column = iqr_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { @@ -714,7 +662,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", 
\"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# IQR detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 5e9c896..bfb371d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -35,22 +35,14 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_iqr_report.r\"))\n", + "setup_ctx <- bootstrap_iqr_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", 
\"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")" ] }, { @@ -64,13 +56,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +81,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { @@ -231,76 +215,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = 
toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -556,24 +473,10 @@ }, "outputs": [], "source": [ - "# ---- 0. 
Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -587,83 +490,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. 
Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. 
Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -686,52 +520,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] @@ -747,69 +545,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - 
" \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " 
legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { @@ -876,43 +612,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from 
utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r new file mode 100644 index 0000000..e480314 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r @@ -0,0 +1,174 @@ +# Shared bootstrap for the IQR outliers pipeline notebooks. +# +# Function docs use lightweight R comments to keep notebooks readable +# while documenting expected inputs/outputs for analysts and maintainers. + +#' Initialize runtime context for the IQR outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, loads SNT configuration, and returns a +#' single context object used by notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. 
+bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + # snt_utils::log_msg() expects a global `openhexa` object. + assign("openhexa", openhexa, envir = .GlobalEnv) + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + OUTPUT_DIR = output_dir, + openhexa = openhexa, + config_json = config_json + )) +} + +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. +#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. 
+#' @return Data frame containing validated routine data. +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + +#' Impute flagged outliers using a centered moving average. +#' +#' For each ADM/OU/indicator time series, values marked as outliers are +#' replaced by a 3-point centered moving average (ceiling), preserving +#' non-outlier observations. +#' +#' @param dt Routine data in long format. +#' @param outlier_col Name of the logical outlier flag column. +#' @return Data frame with VALUE_IMPUTED column and helper columns removed. 
+impute_outliers_dt <- function(dt, outlier_col) {
+  dt <- data.table::as.data.table(dt)
+  data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) # chronological order within each series
+  dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # blank flagged values so the window mean skips them
+
+  dt[, MOVING_AVG := data.table::frollapply(
+    TO_IMPUTE,
+    n = 3,
+    FUN = function(x) ceiling(mean(x, na.rm = TRUE)), # NaN when the whole window is NA
+    align = "center"
+  ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] # windows never cross series boundaries
+
+  dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] # fill only the blanked values
+  dt[, c("TO_IMPUTE") := NULL] # NOTE: MOVING_AVG is kept; only TO_IMPUTE is dropped
+
+  return(as.data.frame(data.table::copy(dt))) # copy() guards against by-reference aliasing for callers
+}
+
+#' Build final routine output tables (imputed or removed).
+#'
+#' Reshapes long-format routine values back to wide indicator columns, joins
+#' location names, and standardizes output columns expected by downstream
+#' datasets and reporting.
+#'
+#' @param df Long-format routine data including VALUE_IMPUTED.
+#' @param outlier_column Outlier flag column used to filter removed records.
+#' @param DHIS2_INDICATORS Indicator columns to keep in the final table.
+#' @param fixed_cols Fixed identifier/date columns in long format.
+#' @param pyramid_names Mapping table with ADM/OU names.
+#' @param remove Logical; when TRUE returns outlier-removed data.
+#' @return Wide routine data frame ready for export.
+format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r new file mode 100644 index 0000000..38de5d1 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r @@ -0,0 +1,309 @@ +# Report helpers for the IQR outliers pipeline. + +`%||%` <- function(x, y) if (!is.null(x)) x else y + +# Pull in bootstrap + shared non-report helpers (same folder). 
+.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_iqr.r")) + +printdim <- function(df, name = deparse(substitute(df))) { + if (is.null(df)) { + message(sprintf("%s: NULL", name)) + return(invisible(NULL)) + } + d <- dim(df) + message(sprintf("%s: %s x %s", name, d[1], d[2])) + invisible(d) +} + +plot_outliers <- function(ind_name, df, outlier_col = "OUTLIER_DETECTED") { + if (!ind_name %in% names(df)) return(NULL) + if (!outlier_col %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate( + YEAR = as.integer(.data$YEAR %||% substr(.data$PERIOD, 1, 4)), + MONTH = as.integer(.data$MONTH %||% substr(.data$PERIOD, 5, 6)), + DATE = as.Date(sprintf("%04d-%02d-01", YEAR, MONTH)) + ) %>% + dplyr::group_by(.data$DATE) %>% + dplyr::summarise( + value = sum(.data[[ind_name]], na.rm = TRUE), + has_outlier = any(.data[[outlier_col]] %in% TRUE, na.rm = TRUE), + .groups = "drop" + ) + + ggplot2::ggplot(d, ggplot2::aes(x = .data$DATE, y = .data$value)) + + ggplot2::geom_line(linewidth = 0.8, color = "grey40") + + ggplot2::geom_point(ggplot2::aes(color = .data$has_outlier), size = 2, alpha = 0.9) + + ggplot2::scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) + + ggplot2::labs( + title = sprintf("Outliers - %s (%s)", ind_name, outlier_col), + x = "Mois", + y = "Valeur agregee", + color = "Outlier present" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col = "OUTLIER_DETECTED") { + if (!ind_name %in% names(df)) return(NULL) + if (!outlier_col %in% names(df)) return(NULL) + if (!("ADM2_NAME" %in% names(df) && "ADM2_ID" %in% names(df))) return(NULL) + + d <- df %>% + dplyr::mutate( + 
YEAR = as.integer(.data$YEAR %||% substr(.data$PERIOD, 1, 4)), + MONTH = as.integer(.data$MONTH %||% substr(.data$PERIOD, 5, 6)), + DATE = as.Date(sprintf("%04d-%02d-01", YEAR, MONTH)) + ) %>% + dplyr::group_by(.data$ADM2_ID, .data$ADM2_NAME, .data$YEAR, .data$MONTH, .data$DATE) %>% + dplyr::summarise( + value = sum(.data[[ind_name]], na.rm = TRUE), + has_outlier = any(.data[[outlier_col]] %in% TRUE, na.rm = TRUE), + .groups = "drop" + ) + + if (nrow(d) == 0) return(NULL) + + ggplot2::ggplot( + d, + ggplot2::aes(x = .data$DATE, y = .data$value, group = .data$ADM2_ID) + ) + + ggplot2::geom_line(alpha = 0.35, linewidth = 0.4, color = "grey40") + + ggplot2::geom_point(ggplot2::aes(color = .data$has_outlier), alpha = 0.75, size = 1) + + ggplot2::scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "grey70")) + + ggplot2::facet_wrap(~YEAR, scales = "free_x") + + ggplot2::labs( + title = sprintf("Outliers par district - %s (%s)", ind_name, outlier_col), + x = "Mois", + y = "Valeur (ADM2 agrege)", + color = "Outlier" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + legend.position = "bottom", + strip.text = ggplot2::element_text(face = "bold") + ) +} + +plot_coherence_heatmap <- function( + df, + selected_year, + agg_level = "ADM1_NAME", + filename = NULL, + do_plot = TRUE +) { + if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) + ) + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + + ggplot2::labs( + title = 
sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) + invisible(p) +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) +} + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + 
rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + 
legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 60cfa92..65c10ec 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -26,8 +26,6 
@@ "# Parameters with safe defaults\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "if (!exists(\"CONFIG_FILE_NAME\")) CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "# Partial step is always required by MG logic; complete step is optional.\n", - "RUN_MAGIC_GLASSES_PARTIAL <- TRUE\n", "if (!exists(\"RUN_MAGIC_GLASSES_COMPLETE\")) RUN_MAGIC_GLASSES_COMPLETE <- FALSE\n", "if (!exists(\"DEVIATION_MAD15\")) DEVIATION_MAD15 <- 15\n", "if (!exists(\"DEVIATION_MAD10\")) DEVIATION_MAD10 <- 10\n", @@ -43,13 +41,7 @@ "\n", "if (DEV_SUBSET_ADM1_N < 1) {\n", " stop(\"DEV_SUBSET_ADM1_N must be >= 1\")\n", - "}\n", - "\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", - "OUTPUT_DIR <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", - "dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)" + "}" ] }, { @@ -63,64 +55,25 @@ }, "outputs": [], "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " required_packages <- c(required_packages, \"forecast\")\n", - "}\n", - "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", - " required_packages <- c(required_packages, \"future\", \"future.apply\")\n", - "}\n", - "install_and_load(unique(required_packages))\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", - " future::plan(future::multisession, workers = SEASONAL_WORKERS)\n", - " log_msg(glue::glue(\"Using 
parallel seasonal detection with {SEASONAL_WORKERS} workers\"))\n", - "}\n", - "\n", - "config_json <- fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", - "indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\"))\n", - "\n", - "cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine))\n", - "dt_routine <- as.data.table(dhis2_routine)[, ..cols_to_select]\n", - "\n", - "dhis2_routine_long <- melt(\n", - " dt_routine,\n", - " id.vars = intersect(fixed_cols, names(dt_routine)),\n", - " measure.vars = intersect(indicators_to_keep, names(dt_routine)),\n", - " variable.name = \"INDICATOR\",\n", - " value.name = \"VALUE\",\n", - " variable.factor = FALSE\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses.r\"))\n", + "\n", + "mg_input <- prepare_magic_glasses_input(\n", + " root_path = ROOT_PATH,\n", + " config_file_name = CONFIG_FILE_NAME,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " seasonal_workers = SEASONAL_WORKERS,\n", + " dev_subset = DEV_SUBSET,\n", + " dev_subset_adm1_n = DEV_SUBSET_ADM1_N\n", ")\n", "\n", - "if (DEV_SUBSET) {\n", - " unique_adm1 <- unique(dhis2_routine_long$ADM1_ID)\n", - " adm1_to_keep <- unique_adm1[seq_len(min(DEV_SUBSET_ADM1_N, length(unique_adm1)))]\n", - " dhis2_routine_long <- dhis2_routine_long[ADM1_ID %in% adm1_to_keep]\n", - " log_msg(glue::glue(\"DEV_SUBSET enabled: keeping {length(adm1_to_keep)} ADM1 values\"), \"warning\")\n", - "}\n", - "\n", - 
"log_msg(glue::glue(\"Data loaded: {nrow(dhis2_routine_long)} rows, {length(unique(dhis2_routine_long$OU_ID))} facilities\"))\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " n_groups <- uniqueN(dhis2_routine_long[, .(OU_ID, INDICATOR)])\n", - " log_msg(glue::glue(\"Complete mode active: seasonal detection will run on up to {n_groups} OU_ID x INDICATOR time series.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", - "}" + "setup_ctx <- mg_input$setup_ctx\n", + "COUNTRY_CODE <- mg_input$country_code\n", + "fixed_cols <- mg_input$fixed_cols\n", + "indicators_to_keep <- mg_input$indicators_to_keep\n", + "dhis2_routine <- mg_input$dhis2_routine\n", + "dhis2_routine_long <- mg_input$dhis2_routine_long\n", + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -134,79 +87,10 @@ }, "outputs": [], "source": [ - "detect_outliers_mad_custom <- function(dt, deviation) {\n", - " flag_col <- paste0(\"OUTLIER_MAD\", deviation)\n", - " dt <- copy(dt)\n", - " dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)]\n", - " dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)]\n", - " dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))]\n", - " dt[is.na(get(flag_col)), (flag_col) := FALSE]\n", - " dt[, c(\"median_val\", \"mad_val\") := NULL]\n", - " dt\n", - "}\n", - "\n", - "detect_seasonal_outliers <- function(dt, deviation, workers = 1) {\n", - " outlier_col <- paste0(\"OUTLIER_SEASONAL\", deviation)\n", - " dt <- copy(dt)\n", - " setorder(dt, OU_ID, INDICATOR, PERIOD)\n", - "\n", - " process_group <- function(sub_dt) {\n", - " n_valid <- sum(!is.na(sub_dt$VALUE))\n", - " if (n_valid < 2) {\n", - " return(data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))\n", - " ))\n", - " }\n", - "\n", - " 
values <- as.numeric(sub_dt$VALUE)\n", - " ts_data <- stats::ts(values, frequency = 12)\n", - " cleaned_ts <- tryCatch(\n", - " forecast::tsclean(ts_data, replace.missing = TRUE),\n", - " error = function(e) ts_data\n", - " )\n", - " mad_val <- mad(values, constant = 1, na.rm = TRUE)\n", - "\n", - " if (is.na(mad_val) || mad_val == 0) {\n", - " return(data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))\n", - " ))\n", - " }\n", - "\n", - " is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation\n", - " is_outlier[is.na(is_outlier)] <- FALSE\n", - "\n", - " data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = as.logical(is_outlier)\n", - " )\n", - " }\n", - "\n", - " group_keys <- unique(dt[, .(OU_ID, INDICATOR)])\n", - " group_list <- lapply(seq_len(nrow(group_keys)), function(i) {\n", - " dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]]\n", - " })\n", - "\n", - " if (workers > 1 && requireNamespace(\"future.apply\", quietly = TRUE)) {\n", - " result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE)\n", - " } else {\n", - " result_list <- lapply(group_list, process_group)\n", - " }\n", - "\n", - " outlier_flags <- rbindlist(result_list, use.names = TRUE)\n", - " setnames(outlier_flags, \"OUTLIER_FLAG\", outlier_col)\n", - "\n", - " result_dt <- merge(dt, outlier_flags, by = c(\"PERIOD\", \"OU_ID\", \"INDICATOR\"), all.x = TRUE)\n", - " result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE]\n", - " result_dt\n", - "}" + "# Helpers loaded from utils/snt_dhis2_outliers_imputation_magic_glasses.r\n", + "# - prepare_magic_glasses_input()\n", + "# - run_magic_glasses_outlier_detection()\n", + "# - export_magic_glasses_outputs()" ] }, { @@ -220,72 +104,18 @@ }, "outputs": [], "source": [ - "if 
(RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", - " log_msg(\"Starting MAD15 detection...\")\n", - " flagged_outliers_mad15 <- detect_outliers_mad_custom(dhis2_routine_long, DEVIATION_MAD15)\n", - " flagged_outliers_mad15_filtered <- flagged_outliers_mad15[OUTLIER_MAD15 == FALSE]\n", - "\n", - " log_msg(\"Starting MAD10 detection...\")\n", - " flagged_outliers_mad10 <- detect_outliers_mad_custom(flagged_outliers_mad15_filtered, DEVIATION_MAD10)\n", - " setnames(flagged_outliers_mad10, paste0(\"OUTLIER_MAD\", DEVIATION_MAD10), \"OUTLIER_MAD15_MAD10\")\n", - "\n", - " join_cols <- c(\"PERIOD\", \"OU_ID\", \"INDICATOR\")\n", - " mad10_subset <- flagged_outliers_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]\n", - " flagged_outliers_mad15_mad10 <- merge(\n", - " flagged_outliers_mad15,\n", - " mad10_subset,\n", - " by = join_cols,\n", - " all.x = TRUE\n", - " )\n", - " flagged_outliers_mad15_mad10[is.na(OUTLIER_MAD15_MAD10), OUTLIER_MAD15_MAD10 := TRUE]\n", - " log_msg(glue::glue(\"MAD partial done: {sum(flagged_outliers_mad15_mad10$OUTLIER_MAD15_MAD10)} outliers flagged\"))\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " flagged_outliers_mad15_mad10_filtered <- flagged_outliers_mad15_mad10[OUTLIER_MAD15_MAD10 == FALSE]\n", - "\n", - " if (nrow(flagged_outliers_mad15_mad10_filtered) == 0) {\n", - " log_msg(\"No rows left after MAD partial filtering; seasonal step will be skipped.\", \"warning\")\n", - " flagged_outliers_seasonal5 <- copy(flagged_outliers_mad15_mad10_filtered)\n", - " flagged_outliers_seasonal5[, OUTLIER_SEASONAL5 := FALSE]\n", - " flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5\n", - " flagged_outliers_seasonal3 <- copy(flagged_outliers_seasonal5_filtered)\n", - " flagged_outliers_seasonal3[, OUTLIER_SEASONAL3 := FALSE]\n", - " } else {\n", - " log_msg(glue::glue(\"Starting SEASONAL5 detection on {nrow(flagged_outliers_mad15_mad10_filtered)} rows...\"))\n", - " t_seasonal5 <- 
system.time({\n", - " flagged_outliers_seasonal5 <- detect_seasonal_outliers(\n", - " flagged_outliers_mad15_mad10_filtered,\n", - " deviation = DEVIATION_SEASONAL5,\n", - " workers = SEASONAL_WORKERS\n", - " )\n", - " })\n", - " flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5[OUTLIER_SEASONAL5 == FALSE]\n", - " log_msg(glue::glue(\"SEASONAL5 finished in {round(t_seasonal5['elapsed'], 1)}s. Remaining rows: {nrow(flagged_outliers_seasonal5_filtered)}\"))\n", - "\n", - " log_msg(glue::glue(\"Starting SEASONAL3 detection on {nrow(flagged_outliers_seasonal5_filtered)} rows...\"))\n", - " t_seasonal3 <- system.time({\n", - " flagged_outliers_seasonal3 <- detect_seasonal_outliers(\n", - " flagged_outliers_seasonal5_filtered,\n", - " deviation = DEVIATION_SEASONAL3,\n", - " workers = SEASONAL_WORKERS\n", - " )\n", - " })\n", - " log_msg(glue::glue(\"SEASONAL3 finished in {round(t_seasonal3['elapsed'], 1)}s.\"))\n", - " }\n", - "\n", - " setnames(flagged_outliers_seasonal3, paste0(\"OUTLIER_SEASONAL\", DEVIATION_SEASONAL3), \"OUTLIER_SEASONAL5_SEASONAL3\")\n", + "detection_result <- run_magic_glasses_outlier_detection(\n", + " dhis2_routine_long = dhis2_routine_long,\n", + " deviation_mad15 = DEVIATION_MAD15,\n", + " deviation_mad10 = DEVIATION_MAD10,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " deviation_seasonal5 = DEVIATION_SEASONAL5,\n", + " deviation_seasonal3 = DEVIATION_SEASONAL3,\n", + " seasonal_workers = SEASONAL_WORKERS\n", + ")\n", "\n", - " seasonal3_subset <- flagged_outliers_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]\n", - " flagged_outliers_seasonal5_seasonal3 <- merge(\n", - " flagged_outliers_seasonal5,\n", - " seasonal3_subset,\n", - " by = join_cols,\n", - " all.x = TRUE\n", - " )\n", - " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", - " log_msg(glue::glue(\"SEASONAL complete done: 
{sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", - "}" + "flagged_outliers_mad15_mad10 <- detection_result$flagged_outliers_mad15_mad10\n", + "flagged_outliers_seasonal5_seasonal3 <- detection_result$flagged_outliers_seasonal5_seasonal3" ] }, { @@ -299,117 +129,17 @@ }, "outputs": [], "source": [ - "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", - "flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", - "join_cols <- c(\"PERIOD\", \"OU_ID\", \"INDICATOR\")\n", - "\n", - "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", - " partial_subset <- flagged_outliers_mad15_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]\n", - " flagged_outliers_mg <- merge(flagged_outliers_mg, partial_subset, by = join_cols, all.x = TRUE)\n", - " setnames(flagged_outliers_mg, \"OUTLIER_MAD15_MAD10\", \"OUTLIER_MAGIC_GLASSES_PARTIAL\")\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " complete_subset <- flagged_outliers_seasonal5_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]\n", - " flagged_outliers_mg <- merge(flagged_outliers_mg, complete_subset, by = join_cols, all.x = TRUE)\n", - " setnames(flagged_outliers_mg, \"OUTLIER_SEASONAL5_SEASONAL3\", \"OUTLIER_MAGIC_GLASSES_COMPLETE\")\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE) & OUTLIER_MAGIC_GLASSES_PARTIAL == TRUE,\n", - " OUTLIER_MAGIC_GLASSES_COMPLETE := TRUE]\n", - "}\n", - "\n", - "if (\"OUTLIER_MAGIC_GLASSES_PARTIAL\" %in% names(flagged_outliers_mg)) {\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_PARTIAL), OUTLIER_MAGIC_GLASSES_PARTIAL := FALSE]\n", - "}\n", - "if (\"OUTLIER_MAGIC_GLASSES_COMPLETE\" %in% names(flagged_outliers_mg)) {\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE), OUTLIER_MAGIC_GLASSES_COMPLETE := FALSE]\n", - "}\n", - "\n", - "active_outlier_col <- if (\n", - " RUN_MAGIC_GLASSES_COMPLETE && 
\"OUTLIER_MAGIC_GLASSES_COMPLETE\" %in% names(flagged_outliers_mg)\n", - ") {\n", - " \"OUTLIER_MAGIC_GLASSES_COMPLETE\"\n", - "} else {\n", - " \"OUTLIER_MAGIC_GLASSES_PARTIAL\"\n", - "}\n", - "\n", - "if (!(active_outlier_col %in% names(flagged_outliers_mg))) {\n", - " stop(glue::glue(\"Expected outlier flag column not found: {active_outlier_col}\"))\n", - "}\n", - "\n", - "pyramid_names <- unique(as.data.table(dhis2_routine)[, .(\n", - " ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME\n", - ")])\n", - "\n", - "# 1) Detected table: full routine data + OUTLIER_DETECTED flag (same structure as mean/median/iqr/path)\n", - "outlier_method_label <- if (active_outlier_col == \"OUTLIER_MAGIC_GLASSES_COMPLETE\") \"MAGIC_GLASSES_COMPLETE\" else \"MAGIC_GLASSES_PARTIAL\"\n", - "detected_tbl <- flagged_outliers_mg[, .(\n", - " PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE,\n", - " OUTLIER_DETECTED = get(active_outlier_col),\n", - " OUTLIER_METHOD = outlier_method_label\n", - ")]\n", - "detected_tbl[is.na(OUTLIER_DETECTED), OUTLIER_DETECTED := FALSE]\n", - "detected_tbl <- merge(detected_tbl, unique(pyramid_names), by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"), all.x = TRUE)\n", - "detected_tbl[, DATE := as.Date(sprintf(\"%04d-%02d-01\", YEAR, MONTH))]\n", - "arrow::write_parquet(detected_tbl, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")))\n", - "n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE)\n", - "log_msg(glue::glue(\"Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {COUNTRY_CODE}_routine_outliers_detected.parquet\"))\n", - "\n", - "# Helper to restore routine dataset format (same structure as other outlier pipelines)\n", - "to_routine_wide <- function(dt_long) {\n", - " routine_wide <- dcast(\n", - " dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)],\n", - " PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR,\n", - " value.var = 
\"VALUE\"\n", - " )\n", - "\n", - " routine_wide <- merge(routine_wide, unique(pyramid_names), by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"), all.x = TRUE)\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", indicators_to_keep)\n", - " for (col in setdiff(target_cols, names(routine_wide))) {\n", - " if (col %in% indicators_to_keep) {\n", - " routine_wide[, (col) := NA_real_]\n", - " } else if (col %in% c(\"YEAR\", \"MONTH\")) {\n", - " routine_wide[, (col) := NA_integer_]\n", - " } else {\n", - " routine_wide[, (col) := NA_character_]\n", - " }\n", - " }\n", - " cols_to_keep <- intersect(target_cols, names(routine_wide))\n", - " routine_wide <- routine_wide[, ..cols_to_keep]\n", - " routine_wide\n", - "}\n", - "\n", - "# 2) Imputed routine data (same moving-average logic as other outlier pipelines)\n", - "imputed_long <- copy(flagged_outliers_mg)\n", - "setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH)\n", - "imputed_long[, TO_IMPUTE := fifelse(get(active_outlier_col) == TRUE, NA_real_, VALUE)]\n", - "imputed_long[\n", - " , MOVING_AVG := frollapply(\n", - " TO_IMPUTE,\n", - " n = 3,\n", - " FUN = function(x) ceiling(mean(x, na.rm = TRUE)),\n", - " align = \"center\"\n", - " ),\n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)\n", - "]\n", - "imputed_long[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)]\n", - "imputed_long[, VALUE := VALUE_IMPUTED]\n", - "imputed_long[, c(\"TO_IMPUTE\", \"MOVING_AVG\", \"VALUE_IMPUTED\") := NULL]\n", - "\n", - "routine_imputed <- to_routine_wide(imputed_long)\n", - "arrow::write_parquet(routine_imputed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_imputed.parquet\")))\n", - "log_msg(glue::glue(\"Exported routine imputed table to {COUNTRY_CODE}_routine_outliers_imputed.parquet\"))\n", - "\n", - "# 3) Removed routine data\n", - "# We set outlier values to NA (we do not remove 
rows). The routine data keeps the same structure.\n", - "removed_long <- copy(flagged_outliers_mg)\n", - "removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_]\n", - "\n", - "routine_removed <- to_routine_wide(removed_long)\n", - "arrow::write_parquet(routine_removed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_removed.parquet\")))\n", - "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", - "\n", - "log_msg(\"MG outlier tables exported successfully.\")" + "export_magic_glasses_outputs(\n", + " dhis2_routine_long = dhis2_routine_long,\n", + " flagged_outliers_mad15_mad10 = flagged_outliers_mad15_mad10,\n", + " flagged_outliers_seasonal5_seasonal3 = flagged_outliers_seasonal5_seasonal3,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " dhis2_routine = dhis2_routine,\n", + " fixed_cols = fixed_cols,\n", + " indicators_to_keep = indicators_to_keep,\n", + " output_dir = OUTPUT_DIR,\n", + " country_code = COUNTRY_CODE\n", + ")" ] } ], diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 4751905..cdeaf1a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -28,18 +28,16 @@ "source": [ "# Setup\n", "ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"outliers_imputation\")\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "\n", - 
"source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\"))\n", - "\n", - "# Align logging init with other production notebooks\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses_report.r\"))\n", + "setup_ctx <- bootstrap_magic_glasses_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\")\n", + ")\n", "\n", - "config_json <- fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "DATA_PATH <- setup_ctx$OUTPUT_DIR\n", + "config_json <- fromJSON(file.path(setup_ctx$CONFIG_PATH, \"SNT_config.json\"))\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", "\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r new file mode 100644 index 0000000..4aa5e57 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -0,0 +1,529 @@ +# Main helpers for magic glasses outliers imputation pipeline. + +#' Initialize runtime context for the Magic Glasses pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, and returns a context object consumed by +#' downstream setup and processing functions. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. 
#' @return Named list with paths and OpenHEXA handle.
bootstrap_magic_glasses_context <- function(
  root_path = "~/workspace",
  required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"),
  load_openhexa = TRUE
) {
  # Resolve the standard SNT workspace layout relative to the root folder.
  code_dir <- file.path(root_path, "code")
  config_dir <- file.path(root_path, "configuration")
  data_dir <- file.path(root_path, "data")
  out_dir <- file.path(data_dir, "dhis2", "outliers_imputation")
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  }

  # Shared utilities must be sourced first: install_and_load() lives there.
  source(file.path(code_dir, "snt_utils.r"))
  install_and_load(unique(required_packages))

  # Point reticulate at the conda interpreter that ships the OpenHEXA SDK.
  Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

  openhexa <- if (load_openhexa) reticulate::import("openhexa.sdk") else NULL
  # snt_utils::log_msg() relies on a global `openhexa` object.
  # Expose it before any helper function logs messages.
  assign("openhexa", openhexa, envir = .GlobalEnv)

  list(
    ROOT_PATH = root_path,
    CODE_PATH = code_dir,
    CONFIG_PATH = config_dir,
    DATA_PATH = data_dir,
    OUTPUT_DIR = out_dir,
    openhexa = openhexa
  )
}

#' Load DHIS2 routine input data with validation and logging.
#'
#' Reads the latest routine parquet file from OpenHEXA, logs dataset details,
#' optionally casts YEAR and MONTH to integers, and validates indicator columns.
#' Stops execution with a clear error when required fields are missing.
#'
#' @param dataset_name OpenHEXA dataset identifier/name.
#' @param country_code Country code used in routine filename prefix.
#' @param required_indicators Optional character vector of required indicators.
#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer.
#' @return Data frame containing validated routine data.
load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) {
  routine_file <- paste0(country_code, "_routine.parquet")

  # Fetch the latest routine parquet from OpenHEXA; fail loudly (and log)
  # if the dataset or file cannot be retrieved.
  routine_df <- tryCatch(
    get_latest_dataset_file_in_memory(dataset_name, routine_file),
    error = function(e) {
      msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}")
      log_msg(msg)
      stop(msg)
    }
  )

  log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}"))
  log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(routine_df)} rows, {ncol(routine_df)} columns."))

  # YEAR/MONTH arrive as numeric in the parquet; downstream code indexes on
  # them as integers, so cast when both columns are present.
  date_cols <- c("YEAR", "MONTH")
  if (cast_year_month && all(date_cols %in% colnames(routine_df))) {
    routine_df[date_cols] <- lapply(routine_df[date_cols], as.integer)
  }

  # Validate that every requested indicator column exists before returning.
  if (!is.null(required_indicators)) {
    missing_cols <- setdiff(required_indicators, colnames(routine_df))
    if (length(missing_cols) > 0) {
      msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_cols, collapse = ", "))
      log_msg(msg)
      stop(msg)
    }
  }

  routine_df
}

#' Detect point outliers using MAD thresholding.
#'
#' Computes median and MAD by YEAR, OU_ID and INDICATOR, then flags observations
#' outside median +/- deviation * MAD.
#'
#' @param dt Long-format routine data table.
#' @param deviation Numeric MAD multiplier.
#' @return Data table with method-specific outlier flag column.
detect_outliers_mad_custom <- function(dt, deviation) {
  # Flag column name encodes the deviation, e.g. OUTLIER_MAD15.
  flag_col <- paste0("OUTLIER_MAD", deviation)

  # Work on a copy so the caller's table is never mutated by reference.
  out_dt <- data.table::copy(dt)

  # Robust location/scale per YEAR x facility x indicator group, computed in
  # a single grouped pass (constant = 1 keeps the raw MAD, unscaled).
  out_dt[
    ,
    c("median_val", "mad_val") := .(
      median(VALUE, na.rm = TRUE),
      mad(VALUE, constant = 1, na.rm = TRUE)
    ),
    by = .(YEAR, OU_ID, INDICATOR)
  ]

  # An observation is an outlier when it falls outside median +/- k * MAD.
  out_dt[
    ,
    (flag_col) := (VALUE < (median_val - deviation * mad_val)) |
      (VALUE > (median_val + deviation * mad_val))
  ]

  # NA comparisons (missing VALUE or degenerate groups) are treated as "not
  # an outlier" rather than propagating NA into the flag.
  out_dt[is.na(get(flag_col)), (flag_col) := FALSE]

  # Drop the temporary per-group statistics before returning.
  out_dt[, c("median_val", "mad_val") := NULL]
  out_dt
}

#' Detect seasonal outliers using cleaned time series residuals.
#'
#' Applies `forecast::tsclean` per OU/indicator series and flags observations
#' whose distance from cleaned signal exceeds a MAD-scaled deviation threshold.
#' Supports optional parallel execution when workers > 1.
#'
#' @param dt Long-format routine data table.
#' @param deviation Numeric threshold on scaled residuals.
#' @param workers Number of parallel workers for group processing.
#' @return Data table with seasonal outlier flag column.
detect_seasonal_outliers <- function(dt, deviation, workers = 1) {
  # Flag column name encodes the deviation, e.g. OUTLIER_SEASONAL5.
  outlier_col <- paste0("OUTLIER_SEASONAL", deviation)
  # Copy so the caller's table is not mutated; order by series so each
  # OU/indicator group forms a chronologically sorted time series.
  dt <- data.table::copy(dt)
  data.table::setorder(dt, OU_ID, INDICATOR, PERIOD)

  # Per-series worker: returns a keyed table of logical flags for one
  # OU_ID x INDICATOR group.
  process_group <- function(sub_dt) {
    n_valid <- sum(!is.na(sub_dt$VALUE))
    # Fewer than 2 non-missing points: no usable series, nothing flagged.
    if (n_valid < 2) {
      return(data.table::data.table(
        PERIOD = sub_dt$PERIOD,
        OU_ID = sub_dt$OU_ID,
        INDICATOR = sub_dt$INDICATOR,
        OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))
      ))
    }

    values <- as.numeric(sub_dt$VALUE)
    # Monthly frequency assumed (frequency = 12) for seasonal decomposition.
    ts_data <- stats::ts(values, frequency = 12)
    # tsclean can fail on degenerate series; fall back to the raw series,
    # which yields zero residuals and therefore no flags.
    cleaned_ts <- tryCatch(
      forecast::tsclean(ts_data, replace.missing = TRUE),
      error = function(e) ts_data
    )
    mad_val <- mad(values, constant = 1, na.rm = TRUE)

    # Zero/NA MAD (constant series) would divide by zero: flag nothing.
    if (is.na(mad_val) || mad_val == 0) {
      return(data.table::data.table(
        PERIOD = sub_dt$PERIOD,
        OU_ID = sub_dt$OU_ID,
        INDICATOR = sub_dt$INDICATOR,
        OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))
      ))
    }

    # Residual of raw vs cleaned series, scaled by MAD, against threshold.
    is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation
    is_outlier[is.na(is_outlier)] <- FALSE

    data.table::data.table(
      PERIOD = sub_dt$PERIOD,
      OU_ID = sub_dt$OU_ID,
      INDICATOR = sub_dt$INDICATOR,
      OUTLIER_FLAG = as.logical(is_outlier)
    )
  }

  # Materialize one sub-table per OU_ID x INDICATOR group.
  # NOTE(review): this filters the full table once per group (O(groups * rows));
  # split() or keyed subsetting would be faster on large inputs — candidate
  # for a later performance pass, left unchanged here.
  group_keys <- unique(dt[, .(OU_ID, INDICATOR)])
  group_list <- lapply(seq_len(nrow(group_keys)), function(i) {
    dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]]
  })

  # Parallel path only when requested AND future.apply is installed;
  # otherwise fall back to a plain sequential lapply.
  if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) {
    result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE)
  } else {
    result_list <- lapply(group_list, process_group)
  }

  outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE)
  data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col)

  # Left-join flags back onto the full table; rows with no computed flag
  # (should not normally occur) default to FALSE.
  result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE)
  result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE]
  result_dt
}

#' Convert long routine data back to wide export format.
#'
#' Casts indicator rows to columns, joins administrative names, and guarantees
#' expected export columns exist with appropriate default types.
#'
#' @param dt_long Long-format routine data.
#' @param fixed_cols Fixed identifier/date columns.
#' @param indicators_to_keep Indicator columns expected in output.
#' @param pyramid_names Mapping table with ADM/OU names.
#' @return Wide routine data table ready for parquet export.
to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) {
  # Pivot indicators back into columns; one row per period x org unit.
  routine_wide <- data.table::dcast(
    dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)],
    PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR,
    value.var = "VALUE"
  )

  # Re-attach human-readable ADM/OU names from the pyramid mapping.
  routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE)

  # Guarantee the full export schema: create any missing column with an
  # NA of the appropriate type (numeric for indicators, integer for dates,
  # character for identifiers/names).
  target_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", indicators_to_keep)
  for (col in setdiff(target_cols, names(routine_wide))) {
    if (col %in% indicators_to_keep) {
      routine_wide[, (col) := NA_real_]
    } else if (col %in% c("YEAR", "MONTH")) {
      routine_wide[, (col) := NA_integer_]
    } else {
      routine_wide[, (col) := NA_character_]
    }
  }
  # Keep only (and order by) the target schema.
  cols_to_keep <- intersect(target_cols, names(routine_wide))
  routine_wide <- routine_wide[, ..cols_to_keep]
  routine_wide
}

#' Prepare validated inputs for Magic Glasses detection.
#'
#' Bootstraps runtime context, loads configuration and routine input data,
#' validates required indicators, reshapes data to long format, deduplicates
#' keys, and optionally subsets data for development runs.
#'
#' @param root_path Project root folder (workspace).
#' @param config_file_name Configuration filename under configuration folder.
#' @param run_complete Logical; enable seasonal complete mode.
#' @param seasonal_workers Number of workers for seasonal detection.
#' @param dev_subset Logical; keep only a subset of ADM1 for development.
#' @param dev_subset_adm1_n Number of ADM1 values to keep in dev mode.
#' @return List with setup context, config variables and prepared data tables.
prepare_magic_glasses_input <- function(
  root_path,
  config_file_name = "SNT_config.json",
  run_complete = FALSE,
  seasonal_workers = 1,
  dev_subset = FALSE,
  dev_subset_adm1_n = 2
) {
  # Complete mode needs forecast (tsclean); parallel seasonal detection
  # additionally needs the future / future.apply stack.
  required_packages <- c("arrow", "data.table", "jsonlite", "reticulate", "glue")
  if (run_complete) {
    required_packages <- c(required_packages, "forecast")
  }
  if (run_complete && seasonal_workers > 1) {
    required_packages <- c(required_packages, "future", "future.apply")
  }

  # Paths, package loading, and the global `openhexa` handle.
  setup_ctx <- bootstrap_magic_glasses_context(
    root_path = root_path,
    required_packages = required_packages
  )

  if (run_complete) {
    log_msg("[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.", "warning")
  }

  # Install a multisession plan once, up front, so downstream
  # future_lapply calls pick it up.
  if (run_complete && seasonal_workers > 1) {
    future::plan(future::multisession, workers = seasonal_workers)
    log_msg(glue::glue("Using parallel seasonal detection with {seasonal_workers} workers"))
  }

  config_json <- jsonlite::fromJSON(file.path(setup_ctx$CONFIG_PATH, config_file_name))

  country_code <- config_json$SNT_CONFIG$COUNTRY_CODE
  fixed_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID")
  # Indicator list comes from the config's indicator-definition keys.
  indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)

  dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
  # Loads, casts YEAR/MONTH to integer, and validates indicator columns.
  dhis2_routine <- load_routine_data(
    dataset_name = dataset_name,
    country_code = country_code,
    required_indicators = indicators_to_keep
  )

  # Keep only identifier + indicator columns, then melt to long format
  # (one row per period x org unit x indicator).
  cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine))
  dt_routine <- data.table::as.data.table(dhis2_routine)[, ..cols_to_select]

  dhis2_routine_long <- data.table::melt(
    dt_routine,
    id.vars = intersect(fixed_cols, names(dt_routine)),
    measure.vars = intersect(indicators_to_keep, names(dt_routine)),
    variable.name = "INDICATOR",
    value.name = "VALUE",
    variable.factor = FALSE
  )

  # Deduplicate on the natural key; only sort/unique when duplicates exist.
  # NOTE(review): the local `duplicated` shadows base::duplicated() inside
  # this scope — harmless here but worth renaming in a later pass. The count
  # logged below is the number of duplicated key groups, not removed rows.
  dup_keys <- c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "YEAR", "MONTH", "INDICATOR")
  dup_keys <- intersect(dup_keys, names(dhis2_routine_long))
  if (length(dup_keys) > 0) {
    duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L]
    if (nrow(duplicated) > 0) {
      log_msg(glue::glue("Removing {nrow(duplicated)} duplicated values."))
      data.table::setkeyv(dhis2_routine_long, dup_keys)
      dhis2_routine_long <- unique(dhis2_routine_long)
    }
  }

  # Development shortcut: restrict to the first N ADM1 areas to keep runs fast.
  if (dev_subset) {
    unique_adm1 <- unique(dhis2_routine_long$ADM1_ID)
    adm1_to_keep <- unique_adm1[seq_len(min(dev_subset_adm1_n, length(unique_adm1)))]
    dhis2_routine_long <- dhis2_routine_long[ADM1_ID %in% adm1_to_keep]
    log_msg(glue::glue("DEV_SUBSET enabled: keeping {length(adm1_to_keep)} ADM1 values"), "warning")
  }

  log_msg(glue::glue("Data loaded: {nrow(dhis2_routine_long)} rows, {length(unique(dhis2_routine_long$OU_ID))} facilities"))

  if (run_complete) {
    n_groups <- data.table::uniqueN(dhis2_routine_long[, .(OU_ID, INDICATOR)])
    log_msg(glue::glue("Complete mode active: seasonal detection will run on up to {n_groups} OU_ID x INDICATOR time series."), "warning")
  } else {
    log_msg("Partial mode active: seasonal detection is skipped.")
  }

  list(
    setup_ctx = setup_ctx,
    config_json = config_json,
    country_code = country_code,
    fixed_cols = fixed_cols,
    indicators_to_keep = indicators_to_keep,
    dhis2_routine = dhis2_routine,
    dhis2_routine_long = dhis2_routine_long
  )
}

#' Run Magic Glasses outlier detection workflow.
#'
#' Executes MAD15 then MAD10 detection, and optionally seasonal5 then seasonal3
#' detection for complete mode, returning intermediate/final flag tables used by
#' export and reporting.
#'
#' @param dhis2_routine_long Long-format routine data.
#' @param deviation_mad15 MAD threshold for first pass.
#' @param deviation_mad10 MAD threshold for second pass.
#' @param run_complete Logical; run seasonal stages when TRUE.
#' @param deviation_seasonal5 Seasonal threshold for first seasonal pass.
#' @param deviation_seasonal3 Seasonal threshold for second seasonal pass.
#' @param seasonal_workers Number of workers for seasonal detection.
#' @return List with partial and complete outlier-flag tables.
run_magic_glasses_outlier_detection <- function(
  dhis2_routine_long,
  deviation_mad15 = 15,
  deviation_mad10 = 10,
  run_complete = FALSE,
  deviation_seasonal5 = 5,
  deviation_seasonal3 = 3,
  seasonal_workers = 1
) {
  # Flag-column names are derived from the deviation parameters by the
  # detection helpers. Build them once so non-default deviations work:
  # previously `OUTLIER_MAD15` / `OUTLIER_SEASONAL5` / `OUTLIER_SEASONAL3`
  # were hard-coded and any other deviation value broke the filter and
  # rename steps below.
  mad15_col <- paste0("OUTLIER_MAD", deviation_mad15)
  seasonal5_col <- paste0("OUTLIER_SEASONAL", deviation_seasonal5)
  seasonal3_col <- paste0("OUTLIER_SEASONAL", deviation_seasonal3)

  # Pass 1: wide MAD net.
  log_msg("Starting MAD15 detection...")
  flagged_outliers_mad15 <- detect_outliers_mad_custom(dhis2_routine_long, deviation_mad15)
  flagged_outliers_mad15_filtered <- flagged_outliers_mad15[get(mad15_col) == FALSE]

  # Pass 2: tighter MAD net, applied only to rows that survived pass 1.
  log_msg("Starting MAD10 detection...")
  flagged_outliers_mad10 <- detect_outliers_mad_custom(flagged_outliers_mad15_filtered, deviation_mad10)
  data.table::setnames(flagged_outliers_mad10, paste0("OUTLIER_MAD", deviation_mad10), "OUTLIER_MAD15_MAD10")

  join_cols <- c("PERIOD", "OU_ID", "INDICATOR")
  mad10_subset <- flagged_outliers_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]
  flagged_outliers_mad15_mad10 <- merge(
    flagged_outliers_mad15,
    mad10_subset,
    by = join_cols,
    all.x = TRUE
  )
  # Rows absent from the pass-2 table were filtered out as MAD15 outliers,
  # so their combined flag is TRUE.
  flagged_outliers_mad15_mad10[is.na(OUTLIER_MAD15_MAD10), OUTLIER_MAD15_MAD10 := TRUE]
  log_msg(glue::glue("MAD partial done: {sum(flagged_outliers_mad15_mad10$OUTLIER_MAD15_MAD10)} outliers flagged"))

  flagged_outliers_seasonal5_seasonal3 <- NULL
  if (run_complete) {
    # Seasonal passes run only on rows the MAD cascade did not flag.
    flagged_outliers_mad15_mad10_filtered <- flagged_outliers_mad15_mad10[OUTLIER_MAD15_MAD10 == FALSE]

    if (nrow(flagged_outliers_mad15_mad10_filtered) == 0) {
      # Degenerate case: everything was a MAD outlier. Build empty seasonal
      # tables with the expected flag columns so the merge below still works.
      log_msg("No rows left after MAD partial filtering; seasonal step will be skipped.", "warning")
      flagged_outliers_seasonal5 <- data.table::copy(flagged_outliers_mad15_mad10_filtered)
      flagged_outliers_seasonal5[, (seasonal5_col) := FALSE]
      flagged_outliers_seasonal3 <- data.table::copy(flagged_outliers_seasonal5)
      flagged_outliers_seasonal3[, (seasonal3_col) := FALSE]
    } else {
      # Seasonal pass 1 (looser threshold).
      log_msg(glue::glue("Starting SEASONAL5 detection on {nrow(flagged_outliers_mad15_mad10_filtered)} rows..."))
      t_seasonal5 <- system.time({
        flagged_outliers_seasonal5 <- detect_seasonal_outliers(
          flagged_outliers_mad15_mad10_filtered,
          deviation = deviation_seasonal5,
          workers = seasonal_workers
        )
      })
      flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5[get(seasonal5_col) == FALSE]
      log_msg(glue::glue("SEASONAL5 finished in {round(t_seasonal5['elapsed'], 1)}s. Remaining rows: {nrow(flagged_outliers_seasonal5_filtered)}"))

      # Seasonal pass 2 (tighter threshold) on survivors of pass 1.
      log_msg(glue::glue("Starting SEASONAL3 detection on {nrow(flagged_outliers_seasonal5_filtered)} rows..."))
      t_seasonal3 <- system.time({
        flagged_outliers_seasonal3 <- detect_seasonal_outliers(
          flagged_outliers_seasonal5_filtered,
          deviation = deviation_seasonal3,
          workers = seasonal_workers
        )
      })
      log_msg(glue::glue("SEASONAL3 finished in {round(t_seasonal3['elapsed'], 1)}s."))
    }

    data.table::setnames(flagged_outliers_seasonal3, seasonal3_col, "OUTLIER_SEASONAL5_SEASONAL3")

    seasonal3_subset <- flagged_outliers_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]
    flagged_outliers_seasonal5_seasonal3 <- merge(
      flagged_outliers_seasonal5,
      seasonal3_subset,
      by = join_cols,
      all.x = TRUE
    )
    # As above: rows filtered out before seasonal pass 2 were seasonal5
    # outliers, so the combined flag defaults to TRUE.
    flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]
    log_msg(glue::glue("SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged"))
  }

  list(
    flagged_outliers_mad15_mad10 = flagged_outliers_mad15_mad10,
    flagged_outliers_seasonal5_seasonal3 = flagged_outliers_seasonal5_seasonal3
  )
}

#' Export Magic Glasses outputs for datasets and downstream use.
#'
#' Builds unified detection table, writes imputed and removed routine outputs,
#' and chooses partial or complete outlier flag depending on execution mode.
#'
#' @param dhis2_routine_long Long-format routine data used as base.
#' @param flagged_outliers_mad15_mad10 Partial detection output.
#' @param flagged_outliers_seasonal5_seasonal3 Complete detection output.
#' @param run_complete Logical; export complete method flags when available.
#' @param dhis2_routine Original wide routine table for name mapping.
#' @param fixed_cols Fixed identifier/date columns.
#' @param indicators_to_keep Indicator columns expected in outputs.
#' @param output_dir Output folder path.
#' @param country_code Country code used in output filenames.
#' @return Invisible list with selected active outlier column metadata.
export_magic_glasses_outputs <- function(
  dhis2_routine_long,
  flagged_outliers_mad15_mad10,
  flagged_outliers_seasonal5_seasonal3,
  run_complete,
  dhis2_routine,
  fixed_cols,
  indicators_to_keep,
  output_dir,
  country_code
) {
  # Base table: identifiers + long-format values; flags are joined onto it.
  base_cols <- intersect(c(fixed_cols, "INDICATOR", "VALUE"), names(dhis2_routine_long))
  flagged_outliers_mg <- data.table::copy(dhis2_routine_long[, ..base_cols])
  join_cols <- c("PERIOD", "OU_ID", "INDICATOR")

  # Partial (MAD cascade) flag is always attached.
  partial_subset <- flagged_outliers_mad15_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]
  flagged_outliers_mg <- merge(flagged_outliers_mg, partial_subset, by = join_cols, all.x = TRUE)
  data.table::setnames(flagged_outliers_mg, "OUTLIER_MAD15_MAD10", "OUTLIER_MAGIC_GLASSES_PARTIAL")

  # Complete (seasonal) flag only exists when the complete run produced it.
  if (run_complete && !is.null(flagged_outliers_seasonal5_seasonal3)) {
    complete_subset <- flagged_outliers_seasonal5_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]
    flagged_outliers_mg <- merge(flagged_outliers_mg, complete_subset, by = join_cols, all.x = TRUE)
    data.table::setnames(flagged_outliers_mg, "OUTLIER_SEASONAL5_SEASONAL3", "OUTLIER_MAGIC_GLASSES_COMPLETE")
    # Rows the seasonal stage never saw were MAD outliers; they are outliers
    # under the complete method as well.
    flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE) & OUTLIER_MAGIC_GLASSES_PARTIAL == TRUE, OUTLIER_MAGIC_GLASSES_COMPLETE := TRUE]
  }

  # Remaining NAs default to "not an outlier".
  flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_PARTIAL), OUTLIER_MAGIC_GLASSES_PARTIAL := FALSE]
  if ("OUTLIER_MAGIC_GLASSES_COMPLETE" %in% names(flagged_outliers_mg)) {
    flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE), OUTLIER_MAGIC_GLASSES_COMPLETE := FALSE]
  }

  # Prefer the complete flag when it was computed; otherwise fall back to
  # the partial (MAD-only) flag.
  active_outlier_col <- if (run_complete && "OUTLIER_MAGIC_GLASSES_COMPLETE" %in% names(flagged_outliers_mg)) {
    "OUTLIER_MAGIC_GLASSES_COMPLETE"
  } else {
    "OUTLIER_MAGIC_GLASSES_PARTIAL"
  }

  if (!(active_outlier_col %in% names(flagged_outliers_mg))) {
    stop(glue::glue("Expected outlier flag column not found: {active_outlier_col}"))
  }

  # ADM/OU name lookup reconstructed from the original wide routine table.
  pyramid_names <- unique(data.table::as.data.table(dhis2_routine)[, .(
    ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME
  )])

  # 1) Full detection table (flags + method label + DATE) for DB/reporting.
  outlier_method_label <- if (active_outlier_col == "OUTLIER_MAGIC_GLASSES_COMPLETE") "MAGIC_GLASSES_COMPLETE" else "MAGIC_GLASSES_PARTIAL"
  detected_tbl <- flagged_outliers_mg[, .(
    PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE,
    OUTLIER_DETECTED = get(active_outlier_col),
    OUTLIER_METHOD = outlier_method_label
  )]
  detected_tbl[is.na(OUTLIER_DETECTED), OUTLIER_DETECTED := FALSE]
  detected_tbl <- merge(detected_tbl, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE)
  detected_tbl[, DATE := as.Date(sprintf("%04d-%02d-01", YEAR, MONTH))]
  arrow::write_parquet(detected_tbl, file.path(output_dir, paste0(country_code, "_routine_outliers_detected.parquet")))
  n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE)
  log_msg(glue::glue("Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {country_code}_routine_outliers_detected.parquet"))

  # 2) Imputed routine data: outlier values replaced by a centered 3-point
  # moving average of the non-outlier series (ceiling keeps counts integral).
  imputed_long <- data.table::copy(flagged_outliers_mg)
  data.table::setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH)
  imputed_long[, TO_IMPUTE := data.table::fifelse(get(active_outlier_col) == TRUE, NA_real_, VALUE)]
  imputed_long[
    ,
    MOVING_AVG := data.table::frollapply(
      TO_IMPUTE,
      n = 3,
      FUN = function(x) ceiling(mean(x, na.rm = TRUE)),
      align = "center"
    ),
    by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)
  ]
  imputed_long[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)]
  imputed_long[, VALUE := VALUE_IMPUTED]
  imputed_long[, c("TO_IMPUTE", "MOVING_AVG", "VALUE_IMPUTED") := NULL]

  routine_imputed <- to_routine_wide(
    dt_long = imputed_long,
    fixed_cols = fixed_cols,
    indicators_to_keep = indicators_to_keep,
    pyramid_names = pyramid_names
  )
  arrow::write_parquet(routine_imputed, file.path(output_dir, paste0(country_code, "_routine_outliers_imputed.parquet")))
  log_msg(glue::glue("Exported routine imputed table to {country_code}_routine_outliers_imputed.parquet"))

  # 3) Removed routine data: outlier values set to NA (rows are kept so the
  # table structure is unchanged).
  removed_long <- data.table::copy(flagged_outliers_mg)
  removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_]

  routine_removed <- to_routine_wide(
    dt_long = removed_long,
    fixed_cols = fixed_cols,
    indicators_to_keep = indicators_to_keep,
    pyramid_names = pyramid_names
  )
  arrow::write_parquet(routine_removed, file.path(output_dir, paste0(country_code, "_routine_outliers_removed.parquet")))
  log_msg(glue::glue("Exported routine removed table to {country_code}_routine_outliers_removed.parquet"))

  log_msg("MG outlier tables exported successfully.")
  invisible(list(active_outlier_col = active_outlier_col))
}

diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r
new file mode 100644
index 0000000..c3a9df6
--- /dev/null
+++ 
b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -0,0 +1,24 @@ +# Report helpers for magic glasses outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_magic_glasses.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_magic_glasses.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_magic_glasses.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_magic_glasses.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) +} +source(.target_file) + diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index b03e267..4ff4d3d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -59,24 +59,16 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, 
\"utils\", \"snt_dhis2_outliers_imputation_mean.r\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -120,15 +112,9 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -142,16 +128,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "COUNTRY_CODE <- 
config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", @@ -190,15 +167,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" ] @@ -214,8 +188,7 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR/MONTH casting is handled inside load_routine_data()." ] }, { @@ -237,14 +210,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) {\n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." 
] }, { @@ -533,23 +499,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean.r\n", + "start_time <- Sys.time()" ] }, { @@ -626,24 +577,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from 
utils/snt_dhis2_outliers_imputation_mean.r" ] }, { @@ -658,8 +592,22 @@ "outputs": [], "source": [ "# Format mean tables (imputed and removed)\n", - "dhis2_routine_mean_imputed <- format_routine_data_selection(dhis2_routine_outliers_mean_imputed, mean_column)\n", - "dhis2_routine_mean_removed <- format_routine_data_selection(dhis2_routine_outliers_mean_imputed, mean_column, remove = TRUE)" + "dhis2_routine_mean_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_mean_imputed,\n", + " outlier_column = mean_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_mean_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_mean_imputed,\n", + " outlier_column = mean_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { @@ -711,7 +659,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# Mean detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index 9ced93f..020f9a7 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -35,22 +35,14 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- 
file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_mean_report.r\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")" ] }, { @@ -64,13 +56,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +81,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " 
cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { @@ -231,76 +215,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 
2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_mean_report.r\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -556,24 +473,10 @@ }, "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_mean_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -587,83 +490,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. 
Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -686,52 +520,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] @@ -747,69 +545,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - 
" \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " 
legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { @@ -876,43 +612,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from 
utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r new file mode 100644 index 0000000..75d59ed --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -0,0 +1,169 @@ +# Main helpers for mean outliers imputation pipeline. +# +# Function docs use lightweight R comments to keep notebooks readable +# while documenting expected inputs/outputs for analysts and maintainers. + +#' Initialize runtime context for the mean outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, loads SNT configuration, and returns a +#' single context object used by notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. 
+bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + # snt_utils::log_msg() expects a global `openhexa` object. + assign("openhexa", openhexa, envir = .GlobalEnv) + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + OUTPUT_DIR = output_dir, + openhexa = openhexa, + config_json = config_json + )) +} + +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. 
+#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + +#' Impute flagged outliers using a centered moving average. +#' +#' For each ADM/OU/indicator time series, values marked as outliers are +#' replaced by a 3-point centered moving average (ceiling), preserving +#' non-outlier observations. +#' +#' @param dt Routine data in long format. +#' @param outlier_col Name of the logical outlier flag column. +#' @return Data frame with VALUE_IMPUTED column and helper columns removed. 
+impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +#' Build final routine output tables (imputed or removed). +#' +#' Reshapes long-format routine values back to wide indicator columns, joins +#' location names, and standardizes output columns expected by downstream +#' datasets and reporting. +#' +#' @param df Long-format routine data including VALUE_IMPUTED. +#' @param outlier_column Outlier flag column used to filter removed records. +#' @param DHIS2_INDICATORS Indicator columns to keep in the final table. +#' @param fixed_cols Fixed identifier/date columns in long format. +#' @param pyramid_names Mapping table with ADM/OU names. +#' @param remove Logical; when TRUE returns outlier-removed data. +#' @return Wide routine data frame ready for export. 
+format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} + diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r new file mode 100644 index 0000000..ce792ef --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -0,0 +1,312 @@ +# Report helpers for mean outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_mean.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_mean.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_mean.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_mean.r. 
Tried: ", + paste(.candidate_files, collapse = " | ") + )) +} +source(.target_file) + +`%||%` <- function(x, y) if (!is.null(x)) x else y + +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Outliers for indicator:", ind_name), + subtitle = "Grey = all values, red = detected outliers", + x = "Year", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Outliers by district and year:", ind_name), + x = "District", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 12) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + 
dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) + ) + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + + ggplot2::labs( + title = sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) + invisible(p) +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) +} + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = 
"Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + if (length(check_cols) == 0) { + return(tibble::tibble( + YEAR = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = factor() + )) + } + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + 
ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + if (length(valid_checks) == 0) { + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise(total_reports = dplyr::n(), .groups = "drop") %>% + dplyr::filter(.data$total_reports >= min_reports) + adm_long <- tibble::tibble( + ADM1_NAME = character(), + ADM2_NAME = character(), + ADM2_ID = character(), + YEAR = integer(), + total_reports = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = character() + ) + return(list(adm_coherence = adm_coherence, adm_long = 
adm_long)) + } + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index 90275b5..9b6a64a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -59,24 +59,16 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_median.r\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", 
\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -120,15 +112,9 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -142,16 +128,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", 
@@ -190,15 +167,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" ] @@ -214,8 +188,7 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR/MONTH casting is handled inside load_routine_data()." ] }, { @@ -237,14 +210,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) {\n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." 
] }, { @@ -533,23 +499,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median.r\n", + "start_time <- Sys.time()" ] }, { @@ -626,24 +577,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from 
utils/snt_dhis2_outliers_imputation_median.r" ] }, { @@ -658,8 +592,22 @@ "outputs": [], "source": [ "# Format median tables (imputed and removed)\n", - "dhis2_routine_median_imputed <- format_routine_data_selection(dhis2_routine_outliers_median_imputed, median_column)\n", - "dhis2_routine_median_removed <- format_routine_data_selection(dhis2_routine_outliers_median_imputed, median_column, remove = TRUE)" + "dhis2_routine_median_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_median_imputed,\n", + " outlier_column = median_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_median_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_median_imputed,\n", + " outlier_column = median_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { @@ -711,7 +659,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# Median detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 4705ad1..18bf856 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -35,22 +35,14 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 
\"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_median_report.r\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")" ] }, { @@ -64,13 +56,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +81,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = 
deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { @@ -231,76 +215,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " 
color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_median_report.r\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -556,24 +473,10 @@ }, "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_median_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -587,83 +490,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. 
Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -686,52 +520,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] @@ -747,69 +545,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - 
" \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " 
legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { @@ -876,43 +612,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from 
utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r new file mode 100644 index 0000000..272ee05 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -0,0 +1,169 @@ +# Main helpers for median outliers imputation pipeline. +# +# Function docs use lightweight R comments to keep notebooks readable +# while documenting expected inputs/outputs for analysts and maintainers. + +#' Initialize runtime context for the median outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, loads SNT configuration, and returns a +#' single context object used by notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. 
+bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + # snt_utils::log_msg() expects a global `openhexa` object. + assign("openhexa", openhexa, envir = .GlobalEnv) + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + OUTPUT_DIR = output_dir, + openhexa = openhexa, + config_json = config_json + )) +} + +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. 
+#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + +#' Impute flagged outliers using a centered rolling median. +#' +#' For each ADM/OU/indicator time series, values marked as outliers are +#' replaced by a 3-point centered rolling median (ceiling), preserving +#' non-outlier observations. +#' +#' @param dt Routine data in long format. +#' @param outlier_col Name of the logical outlier flag column. +#' @return Data frame with VALUE_IMPUTED column and helper columns removed. 
+impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MEDIAN_IMPUTED := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(median(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MEDIAN_IMPUTED, TO_IMPUTE)] + dt[, c("TO_IMPUTE", "MEDIAN_IMPUTED") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +#' Build final routine output tables (imputed or removed). +#' +#' Reshapes long-format routine values back to wide indicator columns, joins +#' location names, and standardizes output columns expected by downstream +#' datasets and reporting. +#' +#' @param df Long-format routine data including VALUE_IMPUTED. +#' @param outlier_column Outlier flag column used to filter removed records. +#' @param DHIS2_INDICATORS Indicator columns to keep in the final table. +#' @param fixed_cols Fixed identifier/date columns in long format. +#' @param pyramid_names Mapping table with ADM/OU names. +#' @param remove Logical; when TRUE returns outlier-removed data. +#' @return Wide routine data frame ready for export. 
+format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} + diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r new file mode 100644 index 0000000..ea3d558 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -0,0 +1,312 @@ +# Report helpers for median outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_median.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_median.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_median.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_median.r. 
# Null-coalescing operator: return `x` unless it is NULL, otherwise `y`.
`%||%` <- function(x, y) if (!is.null(x)) x else y

# Print the dimensions of a data frame, labelled with its (deparsed) name.
printdim <- function(df, name = deparse(substitute(df))) {
  cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n")
}

# Scatter plot of VALUE by YEAR for one indicator; rows flagged TRUE in
# `outlier_col` are overplotted in red. Missing/non-finite values are dropped.
plot_outliers <- function(ind_name, df, outlier_col) {
  df_ind <- df %>% dplyr::filter(INDICATOR == ind_name)
  df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))
  ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) +
    ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) +
    ggplot2::geom_point(
      data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE),
      ggplot2::aes(x = YEAR, y = VALUE),
      color = "red",
      size = 2.8,
      alpha = 0.85,
      na.rm = TRUE
    ) +
    ggplot2::labs(
      title = paste("Outliers for indicator:", ind_name),
      subtitle = "Grey = all values, red = detected outliers",
      x = "Year",
      y = "Value"
    ) +
    ggplot2::theme_minimal(base_size = 14)
}

# Same outlier overlay, but plotted per district (ADM2_ID) and faceted by
# YEAR. Returns NULL when no usable rows remain after filtering.
plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {
  df_ind <- df %>%
    dplyr::filter(
      INDICATOR == ind_name,
      !is.na(YEAR),
      !is.na(VALUE),
      is.finite(VALUE)
    )
  if (nrow(df_ind) == 0) {
    return(NULL)
  }
  ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) +
    ggplot2::geom_point(color = "grey60", alpha = 0.3) +
    ggplot2::geom_point(
      data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE),
      color = "red",
      size = 2.8,
      alpha = 0.85
    ) +
    ggplot2::facet_wrap(~ YEAR, scales = "free_y") +
    ggplot2::labs(
      title = paste("Outliers by district and year:", ind_name),
      x = "District",
      y = "Value"
    ) +
    ggplot2::theme_minimal(base_size = 12)
}

# Heatmap of % coherent per check (x) and aggregation unit (y) for one year.
# Returns NULL (silently) when required columns are absent or the year has no
# data; optionally saves to `filename` and prints; returns the plot invisibly.
plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) {
  if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL)
  if (!agg_level %in% names(df)) return(NULL)

  d <- df %>%
    dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>%
    dplyr::filter(.data$YEAR == as.integer(selected_year)) %>%
    dplyr::mutate(
      agg = as.character(.data[[agg_level]]),
      check_label = as.character(.data$check_label)
    )

  if (nrow(d) == 0) return(NULL)

  p <- ggplot2::ggplot(d, ggplot2::aes(
    x = .data$check_label,
    y = .data$agg,
    fill = .data$pct_coherent
  )) +
    ggplot2::geom_tile() +
    ggplot2::scale_fill_viridis_c(
      name = "% coherent",
      option = "viridis",
      limits = c(0, 100)
    ) +
    ggplot2::labs(
      title = sprintf("Coherence (%s) - %s", agg_level, selected_year),
      x = NULL,
      y = NULL
    ) +
    ggplot2::theme_minimal(base_size = 12) +
    ggplot2::theme(
      axis.text.x = ggplot2::element_text(angle = 30, hjust = 1),
      plot.title = ggplot2::element_text(face = "bold")
    )

  if (!is.null(filename)) {
    ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150)
  }

  if (do_plot) print(p)
  invisible(p)
}

# Choropleth of a coherence percentage column on an sf map object.
# Returns NULL when `map_data` is not sf or the column is missing.
plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {
  if (!inherits(map_data, "sf")) return(NULL)
  if (!col_name %in% names(map_data)) return(NULL)

  ggplot2::ggplot(map_data) +
    ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) +
    ggplot2::scale_fill_viridis_c(
      option = "viridis",
      name = indicator_label %||% col_name,
      limits = c(0, 100),
      na.value = "grey90"
    ) +
    ggplot2::labs(title = indicator_label %||% col_name) +
    ggplot2::theme_void(base_size = 12) +
    ggplot2::theme(
      plot.title = ggplot2::element_text(face = "bold", hjust = 0.5),
      legend.position = "right"
    )
}

# Static definitions of the coherence checks: each check is a pair
# c(larger, smaller) asserting larger >= smaller, plus display labels keyed
# by the "pct_coherent_<check>" column names produced downstream.
get_coherence_definitions <- function() {
  checks <- list(
    allout_susp = c("ALLOUT", "SUSP"),
    allout_test = c("ALLOUT", "TEST"),
    susp_test = c("SUSP", "TEST"),
    test_conf = c("TEST", "CONF"),
    conf_treat = c("CONF", "MALTREAT"),
    adm_dth = c("MALADM", "MALDTH")
  )

  check_labels <- c(
    pct_coherent_allout_susp = "Ambulatoire >= Suspects",
    pct_coherent_allout_test = "Ambulatoire >= Testes",
    pct_coherent_susp_test = "Suspects >= Testes",
    pct_coherent_test_conf = "Testes >= Confirmes",
    pct_coherent_conf_treat = "Confirmes >= Traites",
    pct_coherent_adm_dth = "Admissions Palu >= Deces Palu"
  )

  list(checks = checks, check_labels = check_labels)
}

# National-level % coherent per YEAR and check. Checks whose columns are
# absent from `df` evaluate to NA and are dropped after pivoting. Returns a
# long tibble (YEAR, check_type, pct_coherent, check_label) with labels
# ordered by median coherence.
compute_national_coherency_metrics <- function(df, checks, check_labels) {
  df_checks <- df %>%
    dplyr::mutate(
      # Build one logical check_<name> column per defined check via splicing.
      !!!lapply(names(checks), function(check_name) {
        cols <- checks[[check_name]]
        if (all(cols %in% names(df))) {
          rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2]))
        } else {
          rlang::expr(NA)
        }
      }) %>% stats::setNames(paste0("check_", names(checks)))
    )

  check_cols <- intersect(paste0("check_", names(checks)), names(df_checks))
  if (length(check_cols) == 0) {
    # No applicable checks at all: return an empty, correctly-typed result.
    return(tibble::tibble(
      YEAR = integer(),
      check_type = character(),
      pct_coherent = numeric(),
      check_label = factor()
    ))
  }

  df_checks %>%
    dplyr::group_by(.data$YEAR) %>%
    dplyr::summarise(
      dplyr::across(
        dplyr::all_of(check_cols),
        ~ mean(.x, na.rm = TRUE) * 100,
        .names = "pct_{.col}"
      ),
      .groups = "drop"
    ) %>%
    tidyr::pivot_longer(
      cols = dplyr::starts_with("pct_"),
      names_to = "check_type",
      names_prefix = "pct_check_",
      values_to = "pct_coherent"
    ) %>%
    dplyr::filter(!is.na(.data$pct_coherent)) %>%
    dplyr::mutate(
      # check_type values are bare check names, so strip the
      # "pct_coherent_" prefix from the label names before recoding.
      check_label = dplyr::recode(
        .data$check_type,
        !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels)))
      ),
      check_label = factor(.data$check_label, levels = unique(.data$check_label)),
      check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE)
    )
}

# Annotated national heatmap (YEAR x check) of the metrics produced by
# compute_national_coherency_metrics().
plot_national_coherence_heatmap <- function(coherency_metrics) {
  ggplot2::ggplot(coherency_metrics, ggplot2::aes(
    x = factor(.data$YEAR),
    y = .data$check_label,
    fill = .data$pct_coherent
  )) +
    ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) +
    ggplot2::geom_text(
      ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)),
      color = "white",
      fontface = "bold",
      size = 5
    ) +
    viridis::scale_fill_viridis(
      name = "% Coherent",
      option = "viridis",
      limits = c(0, 100),
      direction = -1
    ) +
    ggplot2::labs(
      title = "Controles de coherence des donnees (niveau national)",
      x = "Annee",
      y = NULL
    ) +
    ggplot2::theme_minimal(base_size = 14) +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5),
      axis.text.y = ggplot2::element_text(size = 16, hjust = 0),
      axis.text.x = ggplot2::element_text(size = 16),
      legend.title = ggplot2::element_text(size = 16, face = "bold"),
      legend.text = ggplot2::element_text(size = 14),
      legend.key.width = grid::unit(0.7, "cm"),
      legend.key.height = grid::unit(1.2, "cm")
    )
}

# ADM2-level coherence per YEAR. Returns both the wide per-unit summary
# (`adm_coherence`) and its long form (`adm_long`); units reporting fewer
# than `min_reports` rows are excluded.
compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) {
  df_checks <- df %>%
    dplyr::mutate(
      !!!lapply(names(checks), function(check_name) {
        cols <- checks[[check_name]]
        if (all(cols %in% names(df))) {
          rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2]))
        } else {
          rlang::expr(NA_real_)
        }
      }) %>% stats::setNames(paste0("check_", names(checks)))
    )

  check_cols <- names(df_checks)[grepl("^check_", names(df_checks))]
  # Keep only checks with at least one non-NA evaluation.
  valid_checks <- check_cols[
    purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))
  ]
  if (length(valid_checks) == 0) {
    # No evaluable checks: still return the report counts plus an empty,
    # correctly-typed long table so callers can rely on the shape.
    adm_coherence <- df_checks %>%
      dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>%
      dplyr::summarise(total_reports = dplyr::n(), .groups = "drop") %>%
      dplyr::filter(.data$total_reports >= min_reports)
    adm_long <- tibble::tibble(
      ADM1_NAME = character(),
      ADM2_NAME = character(),
      ADM2_ID = character(),
      YEAR = integer(),
      total_reports = integer(),
      check_type = character(),
      pct_coherent = numeric(),
      check_label = character()
    )
    return(list(adm_coherence = adm_coherence, adm_long = adm_long))
  }

  adm_coherence <- df_checks %>%
    dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>%
    dplyr::summarise(
      total_reports = dplyr::n(),
      # BUGFIX: `.x` must be unquoted into the captured expression; a bare
      # `.data[[.x]]` defers evaluation of `.x` to summarise() time, where
      # it is no longer in scope ("object '.x' not found").
      !!!purrr::map(
        valid_checks,
        ~ rlang::expr(100 * mean(.data[[!!.x]], na.rm = TRUE))
      ) %>%
        stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))),
      .groups = "drop"
    ) %>%
    dplyr::filter(.data$total_reports >= min_reports)

  adm_long <- adm_coherence %>%
    tidyr::pivot_longer(
      cols = dplyr::starts_with("pct_coherent_"),
      names_to = "check_type",
      values_to = "pct_coherent"
    ) %>%
    dplyr::filter(!is.na(.data$pct_coherent)) %>%
    dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels))

  list(adm_coherence = adm_coherence, adm_long = adm_long)
}
root_path = ROOT_PATH,\n", + " required_packages = c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -128,15 +120,9 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -150,16 +136,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "for (conf in c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\")) {\n", - " print(glue(\"{conf} : {config_json$SNT_CONFIG[conf]}\"))\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "# Set config vars\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", @@ -199,15 +176,12 
@@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 4)" ] @@ -231,14 +205,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) { \n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." 
] }, { @@ -270,12 +237,8 @@ }, "outputs": [], "source": [ - "dhis2_routine_long <- dhis2_routine %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM1_NAME\", \"ADM2_ID\", \"ADM2_NAME\", \"OU_ID\", \"OU_NAME\", \"PERIOD\", DHIS2_INDICATORS))) %>%\n", - " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\") %>%\n", - " # ⚠️ NEW: Complete missing date-indicator combinations for each facility\n", - " complete(nesting(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME), PERIOD, INDICATOR) %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"INDICATOR\", \"VALUE\")))\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_path.r\n", + "dhis2_routine_long <- build_path_routine_long(dhis2_routine, DHIS2_INDICATORS)\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -300,17 +263,10 @@ }, "outputs": [], "source": [ - "# check if there are any duplicates\n", - "duplicated <- dhis2_routine_long %>%\n", - " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>%\n", - " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", - " filter(n > 1L)\n", - "\n", - "# Remove dups\n", + "dedup_result <- remove_path_duplicates(dhis2_routine_long)\n", + "dhis2_routine_long <- dedup_result$data\n", + "duplicated <- dedup_result$duplicated\n", "if (nrow(duplicated) > 0) {\n", - " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " dhis2_routine_long <- dhis2_routine_long %>%\n", - " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, .keep_all = TRUE)\n", " head(duplicated)\n", "}" ] @@ -413,23 +369,7 @@ }, "outputs": [], "source": [ - "# high presumed cases during lower tests\n", - "low_testing_periods <- dhis2_routine_outliers %>%\n", - " filter(INDICATOR == \"TEST\") %>%\n", - " mutate(\n", - " low_testing = case_when(VALUE < MEAN_80 ~ TRUE, TRUE ~ FALSE), \n", - " # presumed may not exceed upper limits for tests \n", - " upper_limit_tested = MEAN_80 + 
MEAN_DEVIATION * SD_80) %>% \n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"low_testing\", \"upper_limit_tested\")))\n", - "\n", - "# decide which one could be possible stock-out periods\n", - "possible_stockout <- dhis2_routine_outliers %>%\n", - " filter(OUTLIER_TREND == TRUE) %>%\n", - " left_join(low_testing_periods, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>% \n", - " # make sure value does not exceed reasonable figures\n", - " mutate(POSSIBLE_STKOUT = case_when(low_testing == TRUE & INDICATOR == \"PRES\" & VALUE < upper_limit_tested ~ TRUE, TRUE ~ FALSE)) %>%\n", - " filter(POSSIBLE_STKOUT == TRUE) %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"POSSIBLE_STKOUT\")))" + "possible_stockout <- detect_possible_stockout(dhis2_routine_outliers, MEAN_DEVIATION)" ] }, { @@ -481,23 +421,7 @@ }, "outputs": [], "source": [ - "# ⚠️ UPDATED\n", - "possible_epidemic <- dhis2_routine_outliers %>% \n", - " filter(INDICATOR == \"TEST\" | INDICATOR == \"CONF\") %>%\n", - " rename(total = VALUE) %>% \n", - " # outlier threshold max value\n", - " mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80) %>% \n", - " # remove columns not necessary for wider format\n", - " select(-c(\"MEAN_80\", \"SD_80\")) %>% \n", - " # wider format with two values (value and outlier-threshold max value) for each INDICATOR\n", - " pivot_wider(names_from = INDICATOR, values_from = c(total, max_value, OUTLIER_TREND)) %>% \n", - " unnest(cols = everything()) %>% \n", - " # ⚠️ NEW LOGIC: epidemic if CONF is outlier AND (TEST is outlier OR tests >= confirmed)\n", - " mutate(POSSIBLE_EPID = case_when(\n", - " OUTLIER_TREND_CONF == TRUE & (OUTLIER_TREND_TEST == TRUE | total_TEST >= total_CONF) ~ TRUE,\n", - " TRUE ~ FALSE)) %>%\n", - " filter(POSSIBLE_EPID == TRUE) %>% \n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"POSSIBLE_EPID\")))\n", + "possible_epidemic <- 
detect_possible_epidemic(dhis2_routine_outliers, MEAN_DEVIATION)\n", "\n", "epidemic_n <- length(unique(possible_epidemic$OU_ID))\n", "if (epidemic_n > 0) { \n", @@ -531,34 +455,11 @@ }, "outputs": [], "source": [ - "# Join columns and correct outliers column\n", - "routine_data_outliers_clean <- dhis2_routine_outliers %>% \n", - " left_join(possible_stockout, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>%\n", - " mutate(OUTLIER_TREND_01 = case_when(OUTLIER_TREND == TRUE & INDICATOR ==\"PRES\" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>%\n", - " left_join(possible_epidemic, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>%\n", - " mutate(OUTLIER_TREND_02 = case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c(\"CONF\", \"TEST\") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>% \n", - " select(-OUTLIER_TREND) %>%\n", - " rename(OUTLIER_TREND = OUTLIER_TREND_02) %>% \n", - " mutate(\n", - " YEAR = as.integer(substr(PERIOD, 1, 4)),\n", - " MONTH = as.integer(substr(PERIOD, 5, 6))) %>%\n", - " select(all_of(\n", - " c(\n", - " \"PERIOD\",\n", - " \"YEAR\",\n", - " \"MONTH\",\n", - " \"ADM1_ID\",\n", - " \"ADM2_ID\",\n", - " \"OU_ID\", \n", - " \"INDICATOR\",\n", - " \"VALUE\",\n", - " \"MEAN_80\",\n", - " \"SD_80\",\n", - " \"OUTLIER_TREND\",\n", - " \"POSSIBLE_STKOUT\", \n", - " \"POSSIBLE_EPID\" \n", - " )\n", - " ))\n", + "routine_data_outliers_clean <- build_path_clean_outliers(\n", + " dhis2_routine_outliers = dhis2_routine_outliers,\n", + " possible_stockout = possible_stockout,\n", + " possible_epidemic = possible_epidemic\n", + ")\n", "\n", "print(dim(routine_data_outliers_clean))\n", "head(routine_data_outliers_clean, 2)" @@ -616,35 +517,7 @@ }, "outputs": [], "source": [ - "# ⚠️ UPDATED: Added reversal check to prevent illogical corrections\n", - "# replace outliers by mean_80\n", - "routine_data_outliers_imputed <- routine_data_outliers_clean %>% \n", - " rename(VALUE_OLD = VALUE) %>% \n", - " # replace 
outliers with the mean 80% value\n", - " mutate(VALUE_IMPUTED = ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>%\n", - " # ⚠️ NEW: Pivot to check test/conf relationship\n", - " select(all_of(c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"INDICATOR\", \"VALUE_OLD\", \"VALUE_IMPUTED\", \"OUTLIER_TREND\"))) %>%\n", - " pivot_wider(names_from = INDICATOR, values_from = c(VALUE_OLD, VALUE_IMPUTED, OUTLIER_TREND)) %>%\n", - " # ⚠️ NEW: Reversal check - undo corrections if they create impossible situations\n", - " # (i.e., if imputed tests < imputed conf, but original tests > original conf)\n", - " mutate(reverse_val = case_when(\n", - " !is.na(VALUE_IMPUTED_TEST) & !is.na(VALUE_IMPUTED_CONF) & \n", - " VALUE_IMPUTED_TEST < VALUE_IMPUTED_CONF & \n", - " VALUE_OLD_TEST > VALUE_OLD_CONF ~ TRUE,\n", - " TRUE ~ FALSE)) %>%\n", - " # Correct TEST if reversed\n", - " mutate(VALUE_IMPUTED_TEST = ifelse(reverse_val == TRUE, VALUE_OLD_TEST, VALUE_IMPUTED_TEST),\n", - " OUTLIER_TREND_TEST = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_TEST)) %>%\n", - " # Correct CONF if reversed\n", - " mutate(VALUE_IMPUTED_CONF = ifelse(reverse_val == TRUE, VALUE_OLD_CONF, VALUE_IMPUTED_CONF),\n", - " OUTLIER_TREND_CONF = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_CONF)) %>%\n", - " # ⚠️ Pivot back to long format\n", - " select(-reverse_val) %>%\n", - " pivot_longer(cols = starts_with(\"VALUE_OLD_\") | starts_with(\"VALUE_IMPUTED_\") | starts_with(\"OUTLIER_TREND_\"),\n", - " names_to = c(\".value\", \"INDICATOR\"),\n", - " names_pattern = \"(.*)_(.*)$\") %>%\n", - " arrange(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"INDICATOR\") %>%\n", - " select(all_of(c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"INDICATOR\", \"VALUE_OLD\", \"VALUE_IMPUTED\", \"OUTLIER_TREND\")))\n", + "routine_data_outliers_imputed <- impute_path_outliers(routine_data_outliers_clean)\n", "\n", "print(dim(routine_data_outliers_imputed))\n", 
"head(routine_data_outliers_imputed, 2)" @@ -800,7 +673,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH , \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# Save routine outliers table (parquet)\n", "outliers_parquet <- file.path(output_path , paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")) \n", diff --git a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r new file mode 100644 index 0000000..288c12c --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r @@ -0,0 +1,255 @@ +# Helpers for PATH outliers imputation notebook. + +#' Initialize runtime context for the PATH outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, and returns a context object used by +#' notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. 
bootstrap_path_context <- function(
  root_path = "~/workspace",
  required_packages = c("arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", "reticulate", "glue"),
  load_openhexa = TRUE
) {
  # Standard workspace layout: shared code, configuration, and data roots.
  code_path <- file.path(root_path, "code")
  config_path <- file.path(root_path, "configuration")
  data_path <- file.path(root_path, "data")
  output_dir <- file.path(data_path, "dhis2", "outliers_imputation")
  # Idempotent: creating an existing directory is a silent no-op.
  dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

  # Order matters: snt_utils.r provides install_and_load() and log_msg(),
  # which everything below (and the notebook) relies on.
  source(file.path(code_path, "snt_utils.r"))
  install_and_load(required_packages)

  # Conda-provided geo libraries and Python interpreter for reticulate.
  Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
  Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")
  Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

  openhexa <- NULL
  if (load_openhexa) {
    openhexa <- reticulate::import("openhexa.sdk")
  }
  # NOTE(review): deliberately exported to the global environment so notebook
  # cells can use `openhexa` directly; global state to be aware of.
  assign("openhexa", openhexa, envir = .GlobalEnv)

  # Parse SNT_config.json, logging and re-raising on any read/parse failure.
  config_json <- tryCatch(
    {
      jsonlite::fromJSON(file.path(config_path, "SNT_config.json"))
    },
    error = function(e) {
      msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}")
      log_msg(msg)
      stop(msg)
    }
  )

  # Context object consumed by the notebook (paths, SDK handle, config).
  return(list(
    ROOT_PATH = root_path,
    CODE_PATH = code_path,
    CONFIG_PATH = config_path,
    DATA_PATH = data_path,
    OUTPUT_DIR = output_dir,
    openhexa = openhexa,
    config_json = config_json
  ))
}

#' Load DHIS2 routine input data with validation and logging.
#'
#' Reads the latest routine parquet file from OpenHEXA, logs dataset details,
#' optionally casts YEAR and MONTH to integers, and validates indicator columns.
#' Stops execution with a clear error when required fields are missing.
#'
#' @param dataset_name OpenHEXA dataset identifier/name.
#' @param country_code Country code used in routine filename prefix.
#' @param required_indicators Optional character vector of required indicators.
#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer.
#' @return Data frame containing validated routine data.
#' Load the latest DHIS2 routine parquet file for a country.
#'
#' Fetches `<country_code>_routine.parquet` from the given OpenHEXA dataset,
#' logs provenance and dimensions, optionally casts YEAR/MONTH to integer,
#' and fails fast when required indicator columns are absent.
#'
#' @param dataset_name OpenHEXA dataset identifier/name.
#' @param country_code Country code used as the routine filename prefix.
#' @param required_indicators Optional character vector of indicator columns
#'   that must all be present in the loaded data.
#' @param cast_year_month Logical; cast YEAR/MONTH to integer when both exist.
#' @return Data frame containing the validated routine data.
load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) {
  routine_file <- paste0(country_code, "_routine.parquet")

  # Fetch from OpenHEXA; log and re-raise with a consistent message on failure.
  dhis2_routine <- tryCatch(
    get_latest_dataset_file_in_memory(dataset_name, routine_file),
    error = function(e) {
      msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}")
      log_msg(msg)
      stop(msg)
    }
  )

  log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}"))
  log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns."))

  # Cast time columns to integer, but only when both are present.
  time_cols <- c("YEAR", "MONTH")
  if (cast_year_month && all(time_cols %in% colnames(dhis2_routine))) {
    dhis2_routine[time_cols] <- lapply(dhis2_routine[time_cols], as.integer)
  }

  # Validate that every required indicator column exists; stop otherwise.
  if (!is.null(required_indicators)) {
    missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine))
    if (length(missing_indicators) > 0) {
      msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", "))
      log_msg(msg)
      stop(msg)
    }
  }

  dhis2_routine
}

#' Build PATH-ready long routine table.
#'
#' Selects required administrative/time columns, pivots indicator values to
#' long format, and completes missing PERIOD/INDICATOR combinations for each
#' location.
#'
#' @param dhis2_routine Routine data in wide format.
#' @param DHIS2_INDICATORS Indicator column names to pivot.
#' @return Long-format routine data frame used by PATH detection.
build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) {
  # Location/time identifiers kept through the pivot; only IDs are exported.
  location_cols <- c("ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME", "OU_ID", "OU_NAME")
  output_cols <- c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "INDICATOR", "VALUE")

  dhis2_routine %>%
    dplyr::select(dplyr::all_of(c(location_cols, "PERIOD", DHIS2_INDICATORS))) %>%
    tidyr::pivot_longer(
      cols = dplyr::all_of(DHIS2_INDICATORS),
      names_to = "INDICATOR",
      values_to = "VALUE"
    ) %>%
    # Fill in absent PERIOD x INDICATOR combinations per facility (NA VALUE).
    tidyr::complete(
      tidyr::nesting(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME),
      PERIOD,
      INDICATOR
    ) %>%
    dplyr::select(dplyr::all_of(output_cols))
}

#' Remove duplicate observations in PATH long routine data.
#'
#' Detects duplicate keys at ADM/OU/PERIOD/INDICATOR level, logs how many
#' duplicated key groups were found, and keeps only distinct rows when any
#' duplicates exist.
#'
#' @param dhis2_routine_long Long-format routine data frame.
#' @return List with cleaned `data` and a `duplicated` summary table.
remove_path_duplicates <- function(dhis2_routine_long) {
  # One row per key that appears more than once (n = occurrence count).
  duplicated <- dhis2_routine_long %>%
    dplyr::group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    dplyr::filter(n > 1L)

  cleaned <- dhis2_routine_long
  if (nrow(duplicated) > 0) {
    log_msg(glue::glue("Removing {nrow(duplicated)} duplicated values."))
    # Keep the first row per key; drop the rest.
    cleaned <- cleaned %>%
      dplyr::distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, .keep_all = TRUE)
  }

  list(data = cleaned, duplicated = duplicated)
}

#' Detect potential stock-out exceptions in PATH logic.
#'
#' Flags periods where PRES is marked outlier while TEST is unusually low and
#' PRES remains within a reasonable upper range, indicating likely stock-out
#' behavior rather than true anomaly.
#'
#' @param dhis2_routine_outliers Routine table with OUTLIER_TREND and stats.
#' @param MEAN_DEVIATION Deviation multiplier used in PATH thresholds.
#' @return Data frame of flagged stock-out exception keys.
detect_possible_stockout <- function(dhis2_routine_outliers, MEAN_DEVIATION) {
  # Per ADM/OU/PERIOD: was testing below its MEAN_80 baseline, and what is
  # the ceiling (MEAN_80 + MEAN_DEVIATION * SD_80) derived from TEST rows?
  low_testing_periods <- dhis2_routine_outliers %>%
    dplyr::filter(INDICATOR == "TEST") %>%
    dplyr::mutate(
      low_testing = dplyr::case_when(VALUE < MEAN_80 ~ TRUE, TRUE ~ FALSE),
      upper_limit_tested = MEAN_80 + MEAN_DEVIATION * SD_80
    ) %>%
    dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "low_testing", "upper_limit_tested")))

  # Among trend-outlier rows only: a PRES outlier during a low-testing period
  # that stays under the TEST-derived ceiling is flagged as likely stock-out.
  # Rows with no matching TEST record get NA from the left join; the
  # case_when fallback (TRUE ~ FALSE) maps those to not-flagged.
  dhis2_routine_outliers %>%
    dplyr::filter(OUTLIER_TREND == TRUE) %>%
    dplyr::left_join(low_testing_periods, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>%
    dplyr::mutate(POSSIBLE_STKOUT = dplyr::case_when(low_testing == TRUE & INDICATOR == "PRES" & VALUE < upper_limit_tested ~ TRUE, TRUE ~ FALSE)) %>%
    dplyr::filter(POSSIBLE_STKOUT == TRUE) %>%
    # Only the keys plus the flag are returned, ready for a left join.
    dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_STKOUT")))
}

#' Detect potential epidemic exceptions in PATH logic.
#'
#' Identifies periods where CONF is outlier and TEST also supports epidemic
#' behavior (test outlier or TEST >= CONF), so values should not be suppressed
#' as reporting anomalies.
#'
#' @param dhis2_routine_outliers Routine table with OUTLIER_TREND and stats.
#' @param MEAN_DEVIATION Deviation multiplier used in PATH thresholds.
#' @return Data frame of flagged epidemic exception keys.
detect_possible_epidemic <- function(dhis2_routine_outliers, MEAN_DEVIATION) {
  dhis2_routine_outliers %>%
    # Epidemic logic only compares TEST vs CONF.
    dplyr::filter(INDICATOR == "TEST" | INDICATOR == "CONF") %>%
    dplyr::rename(total = VALUE) %>%
    # Outlier threshold kept per row; baseline stats dropped before widening.
    dplyr::mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80) %>%
    dplyr::select(-c("MEAN_80", "SD_80")) %>%
    # One row per key with total_/max_value_/OUTLIER_TREND_ columns per
    # indicator. NOTE(review): pivot_wider yields list-columns when duplicate
    # keys remain, hence the unnest(everything()) — confirm duplicates are
    # removed upstream (remove_path_duplicates) before calling this.
    tidyr::pivot_wider(names_from = INDICATOR, values_from = c(total, max_value, OUTLIER_TREND)) %>%
    tidyr::unnest(cols = dplyr::everything()) %>%
    # Epidemic if CONF is an outlier AND (TEST is an outlier OR tests still
    # cover confirmed cases); NA conditions fall through to FALSE.
    dplyr::mutate(POSSIBLE_EPID = dplyr::case_when(
      OUTLIER_TREND_CONF == TRUE & (OUTLIER_TREND_TEST == TRUE | total_TEST >= total_CONF) ~ TRUE,
      TRUE ~ FALSE
    )) %>%
    dplyr::filter(POSSIBLE_EPID == TRUE) %>%
    # Keys plus the flag, ready for a left join.
    dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_EPID")))
}

#' Apply PATH exception logic and build cleaned outlier table.
#'
#' Joins stock-out and epidemic exception flags, updates OUTLIER_TREND after
#' exception rules, and standardizes key output columns including YEAR/MONTH.
#'
#' @param dhis2_routine_outliers Base PATH outlier table.
#' @param possible_stockout Output from `detect_possible_stockout`.
#' @param possible_epidemic Output from `detect_possible_epidemic`.
#' @return Cleaned long-format outlier table for imputation/export.
build_path_clean_outliers <- function(dhis2_routine_outliers, possible_stockout, possible_epidemic) {
  dhis2_routine_outliers %>%
    # Step 1: stock-out exceptions clear PRES outlier flags.
    dplyr::left_join(possible_stockout, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>%
    dplyr::mutate(OUTLIER_TREND_01 = dplyr::case_when(OUTLIER_TREND == TRUE & INDICATOR == "PRES" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>%
    # Step 2: epidemic exceptions re-assert TEST/CONF outlier flags. Order
    # is deliberate: step 2 reads OUTLIER_TREND_01, not the original flag.
    dplyr::left_join(possible_epidemic, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>%
    dplyr::mutate(OUTLIER_TREND_02 = dplyr::case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c("CONF", "TEST") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>%
    # The final flag replaces the original OUTLIER_TREND column name.
    dplyr::select(-OUTLIER_TREND) %>%
    dplyr::rename(OUTLIER_TREND = OUTLIER_TREND_02) %>%
    # PERIOD is a "YYYYMM" string: split into integer YEAR and MONTH.
    dplyr::mutate(
      YEAR = as.integer(substr(PERIOD, 1, 4)),
      MONTH = as.integer(substr(PERIOD, 5, 6))
    ) %>%
    # Fixed output column order consumed by imputation and export.
    dplyr::select(dplyr::all_of(c(
      "PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID",
      "INDICATOR", "VALUE", "MEAN_80", "SD_80",
      "OUTLIER_TREND", "POSSIBLE_STKOUT", "POSSIBLE_EPID"
    )))
}

#' Impute PATH outliers and enforce TEST/CONF consistency.
#'
#' Replaces flagged values using MEAN_80, reshapes data to evaluate TEST vs CONF
#' consistency, and reverts impossible imputations when they create TEST < CONF
#' while original values were logically consistent.
#'
#' @param routine_data_outliers_clean Clean outlier table from PATH logic.
#' @return Long-format routine table with VALUE_OLD, VALUE_IMPUTED and flags.
impute_path_outliers <- function(routine_data_outliers_clean) {
  routine_data_outliers_clean %>%
    dplyr::rename(VALUE_OLD = VALUE) %>%
    # Flagged values are replaced by the 80%-trimmed mean; others pass through.
    dplyr::mutate(VALUE_IMPUTED = ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>%
    dplyr::select(dplyr::all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND"))) %>%
    # Wide per-indicator layout so TEST and CONF can be compared row-wise.
    tidyr::pivot_wider(names_from = INDICATOR, values_from = c(VALUE_OLD, VALUE_IMPUTED, OUTLIER_TREND)) %>%
    # Reversal check: undo the imputation when it flips a logically valid
    # relation (original TEST > CONF) into an impossible one (imputed
    # TEST < CONF). NA comparisons fall through to FALSE.
    dplyr::mutate(reverse_val = dplyr::case_when(
      !is.na(VALUE_IMPUTED_TEST) & !is.na(VALUE_IMPUTED_CONF) &
        VALUE_IMPUTED_TEST < VALUE_IMPUTED_CONF &
        VALUE_OLD_TEST > VALUE_OLD_CONF ~ TRUE,
      TRUE ~ FALSE
    )) %>%
    # Revert both TEST and CONF to their originals and clear their flags.
    dplyr::mutate(
      VALUE_IMPUTED_TEST = ifelse(reverse_val == TRUE, VALUE_OLD_TEST, VALUE_IMPUTED_TEST),
      OUTLIER_TREND_TEST = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_TEST)
    ) %>%
    dplyr::mutate(
      VALUE_IMPUTED_CONF = ifelse(reverse_val == TRUE, VALUE_OLD_CONF, VALUE_IMPUTED_CONF),
      OUTLIER_TREND_CONF = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_CONF)
    ) %>%
    dplyr::select(-reverse_val) %>%
    # Back to long format. The regex's first group is greedy, so INDICATOR is
    # the text after the LAST underscore (e.g. "VALUE_OLD_TEST" -> "TEST").
    tidyr::pivot_longer(
      cols = dplyr::starts_with("VALUE_OLD_") | dplyr::starts_with("VALUE_IMPUTED_") | dplyr::starts_with("OUTLIER_TREND_"),
      names_to = c(".value", "INDICATOR"),
      names_pattern = "(.*)_(.*)$"
    ) %>%
    dplyr::arrange(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>%
    dplyr::select(dplyr::all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND")))
}
deviation_iqr, } run_start_ts = time.time() diff --git a/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py b/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py index 29a678e..8844d9a 100644 --- a/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py +++ b/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py @@ -57,12 +57,9 @@ def snt_dhis2_outliers_imputation_magic_glasses( mode_clean = (mode or "partial").strip().lower() if mode_clean not in ("partial", "complete"): raise ValueError('mode must be "partial" or "complete".') - run_mg_partial = True run_mg_complete = mode_clean == "complete" current_run.log_info(f"Selected detection mode: {mode_clean}") - current_run.log_info( - f"Flags => RUN_MAGIC_GLASSES_PARTIAL={run_mg_partial}, RUN_MAGIC_GLASSES_COMPLETE={run_mg_complete}" - ) + current_run.log_info(f"RUN_MAGIC_GLASSES_COMPLETE={run_mg_complete}") if run_mg_complete: current_run.log_warning( "Complete mode selected: seasonal detection is very slow and can take several hours to run." 
@@ -101,8 +98,6 @@ def snt_dhis2_outliers_imputation_magic_glasses( input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "MG_COMPLETE" if run_mg_complete else "MG_PARTIAL", - "RUN_MAGIC_GLASSES_PARTIAL": run_mg_partial, "RUN_MAGIC_GLASSES_COMPLETE": run_mg_complete, "DEVIATION_MAD15": 15, "DEVIATION_MAD10": 10, diff --git a/snt_dhis2_outliers_imputation_mean/pipeline.py b/snt_dhis2_outliers_imputation_mean/pipeline.py index be6cc22..013183b 100644 --- a/snt_dhis2_outliers_imputation_mean/pipeline.py +++ b/snt_dhis2_outliers_imputation_mean/pipeline.py @@ -81,7 +81,6 @@ def snt_dhis2_outliers_imputation_mean( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "MEAN", "DEVIATION_MEAN": deviation_mean, } run_start_ts = time.time() diff --git a/snt_dhis2_outliers_imputation_median/pipeline.py b/snt_dhis2_outliers_imputation_median/pipeline.py index 0a94ce5..220eb0e 100644 --- a/snt_dhis2_outliers_imputation_median/pipeline.py +++ b/snt_dhis2_outliers_imputation_median/pipeline.py @@ -81,7 +81,6 @@ def snt_dhis2_outliers_imputation_median( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "MEDIAN", "DEVIATION_MEDIAN": deviation_median, } run_start_ts = time.time() diff --git a/snt_dhis2_outliers_imputation_path/pipeline.py b/snt_dhis2_outliers_imputation_path/pipeline.py index d3a55a0..6511d24 100644 --- a/snt_dhis2_outliers_imputation_path/pipeline.py +++ b/snt_dhis2_outliers_imputation_path/pipeline.py @@ -84,7 +84,6 @@ def snt_dhis2_outliers_imputation_path( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "PATH", "DEVIATION_MEAN": deviation_mean, } run_start_ts = time.time()