From f5af3bfdd2beceb8884b220a5a5109ce20eb04f6 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 13:05:31 +0100 Subject: [PATCH 01/17] Refactor outlier imputation notebooks to utilize a shared bootstrap context for configuration and package management. Introduced utility functions for imputation and reporting, enhancing modularity and readability. Updated paths and error handling for loading configurations. --- .../snt_dhis2_outliers_imputation_iqr.ipynb | 94 +++----- ...dhis2_outliers_imputation_iqr_report.ipynb | 214 ++---------------- .../utils/bootstrap.R | 45 ++++ .../utils/imputation_utils.R | 47 ++++ .../utils/reporting_utils.R | 166 ++++++++++++++ 5 files changed, 312 insertions(+), 254 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index 9d03d45..dd66522 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -59,24 +59,22 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_iqr_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Pipeline-specific helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" ] }, { @@ -120,14 +118,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" ] }, @@ -536,23 +528,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/imputation_utils.R\n", + "start_time <- Sys.time()" ] }, { @@ -629,24 +606,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/imputation_utils.R" ] }, { @@ -661,8 +621,22 @@ "outputs": [], "source": [ "# Format IQR tables (imputed and removed)\n", - "dhis2_routine_iqr_imputed <- format_routine_data_selection(dhis2_routine_outliers_iqr_imputed, iqr_column)\n", - "dhis2_routine_iqr_removed <- format_routine_data_selection(dhis2_routine_outliers_iqr_imputed, iqr_column, remove = TRUE)" + "dhis2_routine_iqr_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_iqr_imputed,\n", + " outlier_column = iqr_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_iqr_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_iqr_imputed,\n", + " outlier_column = iqr_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 5e9c896..2fea8de 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -35,22 +35,21 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_iqr_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")\n", "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Reporting helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" ] }, { @@ -64,13 +63,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +88,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/reporting_utils.R" ] }, { @@ -231,76 +222,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/reporting_utils.R\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -747,69 +671,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - " \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/reporting_utils.R" ] }, { @@ -876,43 +738,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from utils/reporting_utils.R" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R new file mode 100644 index 0000000..4f9e7b3 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R @@ -0,0 +1,45 @@ +# Shared bootstrap for the IQR outliers pipeline notebooks. +bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R new file mode 100644 index 0000000..da3b1e9 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R @@ -0,0 +1,47 @@ +# Compute moving-average imputations for a selected outlier flag column. +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + + return(as.data.frame(data.table::copy(dt))) +} + +# Format imputed/removed routine output tables. +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R new file mode 100644 index 0000000..cc6c620 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R @@ -0,0 +1,166 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), + subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", + x = "Année", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + + if (nrow(df_ind) == 0) { + return(NULL) + } + + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Détection des valeurs aberrantes —", ind_name), + subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), + x = "District (ADM2)", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7) + ) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!agg_level %in% names(df)) { + stop(paste0("Aggregation level '", agg_level, "' not found in data!")) + } + + df_year <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% + dplyr::summarise( + pct_coherent = mean(pct_coherent, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% + dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% + dplyr::ungroup() %>% + dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) + + n_units <- dplyr::n_distinct(df_year[[agg_level]]) + plot_height <- max(6, 0.5 * n_units) + agg_label <- if (agg_level == "ADM1_NAME") { + "niveau administratif 1" + } else if (agg_level == "ADM2_NAME") { + "niveau administratif 2" + } else { + agg_level + } + + p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), + size = 5, + fontface = "bold", + color = "white" + ) + + viridis::scale_fill_viridis( + name = "% cohérent", + limits = c(0, 100), + option = "viridis", + direction = -1 + ) + + ggplot2::labs( + title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), + x = "Règle de cohérence", + y = agg_label + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.y = ggplot2::element_text(size = 12), + axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), + plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), + legend.title = ggplot2::element_text(size = 12), + legend.text = ggplot2::element_text(size = 10) + ) + + options(repr.plot.width = 14, repr.plot.height = plot_height) + + if (!is.null(filename)) { + ggplot2::ggsave( + filename = filename, + plot = p, + width = 14, + height = plot_height, + dpi = 300, + limitsize = FALSE + ) + } + if (do_plot) { + print(p) + } +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!col_name %in% names(map_data)) { + stop(paste0("Column '", col_name, "' not found in the data!")) + } + + if (is.null(indicator_label)) { + indicator_label <- col_name + } + + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + + viridis::scale_fill_viridis( + name = paste0("% cohérence\n(", indicator_label, ")"), + option = "magma", + direction = -1, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::facet_wrap(~ YEAR, drop = TRUE) + + ggplot2::labs( + title = "Cohérence des données par niveau administratif 2 et par année", + subtitle = paste("Indicateur :", indicator_label), + caption = "Source : DHIS2 données routinières" + ) + + ggplot2::theme_minimal(base_size = 15) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + strip.text = ggplot2::element_text(size = 14, face = "bold"), + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.position = "right" + ) +} From eb07fe097cfbe070da6f1bc366227dfcb147cd38 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 15:28:33 +0100 Subject: [PATCH 02/17] Refactor outlier imputation notebooks to leverage a shared bootstrap context for configuration and package management. Introduced utility functions for imputation and reporting, improving modularity and readability. Updated paths and error handling for loading configurations across both mean and median imputation pipelines. --- .../snt_dhis2_outliers_imputation_mean.ipynb | 94 +++----- ...his2_outliers_imputation_mean_report.ipynb | 214 ++---------------- .../utils/bootstrap.R | 45 ++++ .../utils/imputation_utils.R | 39 ++++ .../utils/reporting_utils.R | 124 ++++++++++ ...snt_dhis2_outliers_imputation_median.ipynb | 94 +++----- ...s2_outliers_imputation_median_report.ipynb | 214 ++---------------- .../utils/bootstrap.R | 45 ++++ .../utils/imputation_utils.R | 39 ++++ .../utils/reporting_utils.R | 124 ++++++++++ 10 files changed, 524 insertions(+), 508 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index b03e267..2f73668 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -59,24 +59,22 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Pipeline-specific helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" ] }, { @@ -120,14 +118,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" ] }, @@ -533,23 +525,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/imputation_utils.R\n", + "start_time <- Sys.time()" ] }, { @@ -626,24 +603,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/imputation_utils.R" ] }, { @@ -658,8 +618,22 @@ "outputs": [], "source": [ "# Format mean tables (imputed and removed)\n", - "dhis2_routine_mean_imputed <- format_routine_data_selection(dhis2_routine_outliers_mean_imputed, mean_column)\n", - "dhis2_routine_mean_removed <- format_routine_data_selection(dhis2_routine_outliers_mean_imputed, mean_column, remove = TRUE)" + "dhis2_routine_mean_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_mean_imputed,\n", + " outlier_column = mean_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_mean_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_mean_imputed,\n", + " outlier_column = mean_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index 9ced93f..ce58bf5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -35,22 +35,21 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")\n", "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Reporting helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" ] }, { @@ -64,13 +63,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +88,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/reporting_utils.R" ] }, { @@ -231,76 +222,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/reporting_utils.R\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -747,69 +671,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - " \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/reporting_utils.R" ] }, { @@ -876,43 +738,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from utils/reporting_utils.R" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R new file mode 100644 index 0000000..8642d85 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R @@ -0,0 +1,45 @@ +# Shared bootstrap for outliers notebooks. +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R new file mode 100644 index 0000000..72f70e7 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R @@ -0,0 +1,39 @@ +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R new file mode 100644 index 0000000..719f4f6 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R @@ -0,0 +1,124 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), + subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", + x = "Année", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Détection des valeurs aberrantes —", ind_name), + subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), + x = "District (ADM2)", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!agg_level %in% names(df)) { + stop(paste0("Aggregation level '", agg_level, "' not found in data!")) + } + df_year <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% + dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% + dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% + dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% + dplyr::ungroup() %>% + dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) + n_units <- dplyr::n_distinct(df_year[[agg_level]]) + plot_height <- max(6, 0.5 * n_units) + agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level + p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + + viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + + ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.y = ggplot2::element_text(size = 12), + axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), + plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), + legend.title = ggplot2::element_text(size = 12), + legend.text = ggplot2::element_text(size = 10) + ) + options(repr.plot.width = 14, repr.plot.height = plot_height) + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) + } + if (do_plot) { + print(p) + } +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!col_name %in% names(map_data)) { + stop(paste0("Column '", col_name, "' not found in the data!")) + } + if (is.null(indicator_label)) { + indicator_label <- col_name + } + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + + viridis::scale_fill_viridis( + name = paste0("% cohérence\n(", indicator_label, ")"), + option = "magma", + direction = -1, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::facet_wrap(~ YEAR, drop = TRUE) + + ggplot2::labs( + title = "Cohérence des données par niveau administratif 2 et par année", + subtitle = paste("Indicateur :", indicator_label), + caption = "Source : DHIS2 données routinières" + ) + + ggplot2::theme_minimal(base_size = 15) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + strip.text = ggplot2::element_text(size = 14, face = "bold"), + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.position = "right" + ) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index 90275b5..4c81480 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -59,24 +59,22 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Pipeline-specific helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" ] }, { @@ -120,14 +118,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" ] }, @@ -533,23 +525,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/imputation_utils.R\n", + "start_time <- Sys.time()" ] }, { @@ -626,24 +603,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/imputation_utils.R" ] }, { @@ -658,8 +618,22 @@ "outputs": [], "source": [ "# Format median tables (imputed and removed)\n", - "dhis2_routine_median_imputed <- format_routine_data_selection(dhis2_routine_outliers_median_imputed, median_column)\n", - "dhis2_routine_median_removed <- format_routine_data_selection(dhis2_routine_outliers_median_imputed, median_column, remove = TRUE)" + "dhis2_routine_median_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_median_imputed,\n", + " outlier_column = median_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_median_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_median_imputed,\n", + " outlier_column = median_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 4705ad1..04b02b6 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -35,22 +35,21 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")\n", "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Reporting helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" ] }, { @@ -64,13 +63,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +88,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/reporting_utils.R" ] }, { @@ -231,76 +222,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/reporting_utils.R\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -747,69 +671,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - " \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/reporting_utils.R" ] }, { @@ -876,43 +738,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from utils/reporting_utils.R" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R new file mode 100644 index 0000000..8642d85 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R @@ -0,0 +1,45 @@ +# Shared bootstrap for outliers notebooks. +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R new file mode 100644 index 0000000..72f70e7 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R @@ -0,0 +1,39 @@ +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R new file mode 100644 index 0000000..719f4f6 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R @@ -0,0 +1,124 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), + subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", + x = "Année", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Détection des valeurs aberrantes —", ind_name), + subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), + x = "District (ADM2)", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!agg_level %in% names(df)) { + stop(paste0("Aggregation level '", agg_level, "' not found in data!")) + } + df_year <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% + dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% + dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% + dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% + dplyr::ungroup() %>% + dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) + n_units <- dplyr::n_distinct(df_year[[agg_level]]) + plot_height <- max(6, 0.5 * n_units) + agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level + p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + + viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + + ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.y = ggplot2::element_text(size = 12), + axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), + plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), + legend.title = ggplot2::element_text(size = 12), + legend.text = ggplot2::element_text(size = 10) + ) + options(repr.plot.width = 14, repr.plot.height = plot_height) + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) + } + if (do_plot) { + print(p) + } +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!col_name %in% names(map_data)) { + stop(paste0("Column '", col_name, "' not found in the data!")) + } + if (is.null(indicator_label)) { + indicator_label <- col_name + } + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + + viridis::scale_fill_viridis( + name = paste0("% cohérence\n(", indicator_label, ")"), + option = "magma", + direction = -1, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::facet_wrap(~ YEAR, drop = TRUE) + + ggplot2::labs( + title = "Cohérence des données par niveau administratif 2 et par année", + subtitle = paste("Indicateur :", indicator_label), + caption = "Source : DHIS2 données routinières" + ) + + ggplot2::theme_minimal(base_size = 15) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + strip.text = ggplot2::element_text(size = 14, face = "bold"), + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.position = "right" + ) +} From 068efb9205c578e3d06304bcdbf2d6ac345bb101 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 15:45:21 +0100 Subject: [PATCH 03/17] Refactor Magic Glasses notebooks to utilize a shared bootstrap context for configuration and package management. Introduced utility functions for outlier detection, enhancing modularity and readability. Updated paths for loading configurations and improved error handling in reporting notebooks. --- ...s2_outliers_imputation_magic_glasses.ipynb | 178 +++++------------- ...iers_imputation_magic_glasses_report.ipynb | 92 ++++----- .../utils/bootstrap.R | 28 +++ .../utils/magic_glasses_utils.R | 97 ++++++++++ 4 files changed, 223 insertions(+), 172 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 60cfa92..c3b5b3a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "4e1fe23c", "metadata": {}, "source": [ "# Outliers Detection - Magic Glasses (MG)\n", @@ -10,18 +9,16 @@ "Dedicated notebook for MG outlier detection only:\n", "- `OUTLIER_MAGIC_GLASSES_PARTIAL` (MAD15 -> MAD10)\n", "- `OUTLIER_MAGIC_GLASSES_COMPLETE` (MAD15 -> MAD10 -> seasonal5 -> seasonal3)" - ] + ], + "id": "4e1fe23c" }, { "cell_type": "code", - "execution_count": null, - "id": "1ddc1fb2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Parameters with safe defaults\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", @@ -50,20 +47,21 @@ "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", "OUTPUT_DIR <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", "dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1ddc1fb2" }, { "cell_type": "code", - "execution_count": null, - "id": "c91aab68", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", "\n", "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", @@ -72,10 +70,18 @@ "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", " required_packages <- c(required_packages, \"future\", \"future.apply\")\n", "}\n", - "install_and_load(unique(required_packages))\n", "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "setup_ctx <- bootstrap_magic_glasses_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = required_packages\n", + ")\n", + "\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", + "\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"magic_glasses_utils.R\"))\n", "\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", @@ -121,104 +127,34 @@ "} else {\n", " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c91aab68" }, { "cell_type": "code", - "execution_count": null, - "id": "652f7a2e", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "detect_outliers_mad_custom <- function(dt, deviation) {\n", - " flag_col <- paste0(\"OUTLIER_MAD\", deviation)\n", - " dt <- copy(dt)\n", - " dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)]\n", - " dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)]\n", - " dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))]\n", - " dt[is.na(get(flag_col)), (flag_col) := FALSE]\n", - " dt[, c(\"median_val\", \"mad_val\") := NULL]\n", - " dt\n", - "}\n", - "\n", - "detect_seasonal_outliers <- function(dt, deviation, workers = 1) {\n", - " outlier_col <- paste0(\"OUTLIER_SEASONAL\", deviation)\n", - " dt <- copy(dt)\n", - " setorder(dt, OU_ID, INDICATOR, PERIOD)\n", - "\n", - " process_group <- function(sub_dt) {\n", - " n_valid <- sum(!is.na(sub_dt$VALUE))\n", - " if (n_valid < 2) {\n", - " return(data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))\n", - " ))\n", - " }\n", - "\n", - " values <- as.numeric(sub_dt$VALUE)\n", - " ts_data <- stats::ts(values, frequency = 12)\n", - " cleaned_ts <- tryCatch(\n", - " forecast::tsclean(ts_data, replace.missing = TRUE),\n", - " error = function(e) ts_data\n", - " )\n", - " mad_val <- mad(values, constant = 1, na.rm = TRUE)\n", - "\n", - " if (is.na(mad_val) || mad_val == 0) {\n", - " return(data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))\n", - " ))\n", - " }\n", - "\n", - " is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation\n", - " is_outlier[is.na(is_outlier)] <- FALSE\n", - "\n", - " data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = as.logical(is_outlier)\n", - " )\n", - " }\n", - "\n", - " group_keys <- unique(dt[, .(OU_ID, INDICATOR)])\n", - " group_list <- lapply(seq_len(nrow(group_keys)), function(i) {\n", - " dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]]\n", - " })\n", - "\n", - " if (workers > 1 && requireNamespace(\"future.apply\", quietly = TRUE)) {\n", - " result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE)\n", - " } else {\n", - " result_list <- lapply(group_list, process_group)\n", - " }\n", - "\n", - " outlier_flags <- rbindlist(result_list, use.names = TRUE)\n", - " setnames(outlier_flags, \"OUTLIER_FLAG\", outlier_col)\n", - "\n", - " result_dt <- merge(dt, outlier_flags, by = c(\"PERIOD\", \"OU_ID\", \"INDICATOR\"), all.x = TRUE)\n", - " result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE]\n", - " result_dt\n", - "}" - ] + "# Helpers loaded from utils/magic_glasses_utils.R\n", + "# - detect_outliers_mad_custom()\n", + "# - detect_seasonal_outliers()" + ], + "execution_count": null, + "outputs": [], + "id": "652f7a2e" }, { "cell_type": "code", - "execution_count": null, - "id": "1720f1f3", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"Starting MAD15 detection...\")\n", @@ -286,18 +222,18 @@ " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", " log_msg(glue::glue(\"SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1720f1f3" }, { "cell_type": "code", - "execution_count": null, - "id": "d6adc76b", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", "flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", @@ -354,31 +290,6 @@ "n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE)\n", "log_msg(glue::glue(\"Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {COUNTRY_CODE}_routine_outliers_detected.parquet\"))\n", "\n", - "# Helper to restore routine dataset format (same structure as other outlier pipelines)\n", - "to_routine_wide <- function(dt_long) {\n", - " routine_wide <- dcast(\n", - " dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)],\n", - " PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR,\n", - " value.var = \"VALUE\"\n", - " )\n", - "\n", - " routine_wide <- merge(routine_wide, unique(pyramid_names), by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"), all.x = TRUE)\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", indicators_to_keep)\n", - " for (col in setdiff(target_cols, names(routine_wide))) {\n", - " if (col %in% indicators_to_keep) {\n", - " routine_wide[, (col) := NA_real_]\n", - " } else if (col %in% c(\"YEAR\", \"MONTH\")) {\n", - " routine_wide[, (col) := NA_integer_]\n", - " } else {\n", - " routine_wide[, (col) := NA_character_]\n", - " }\n", - " }\n", - " cols_to_keep <- intersect(target_cols, names(routine_wide))\n", - " routine_wide <- routine_wide[, ..cols_to_keep]\n", - " routine_wide\n", - "}\n", - "\n", "# 2) Imputed routine data (same moving-average logic as other outlier pipelines)\n", "imputed_long <- copy(flagged_outliers_mg)\n", "setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH)\n", @@ -396,7 +307,12 @@ "imputed_long[, VALUE := VALUE_IMPUTED]\n", "imputed_long[, c(\"TO_IMPUTE\", \"MOVING_AVG\", \"VALUE_IMPUTED\") := NULL]\n", "\n", - "routine_imputed <- to_routine_wide(imputed_long)\n", + "routine_imputed <- to_routine_wide(\n", + " dt_long = imputed_long,\n", + " fixed_cols = fixed_cols,\n", + " indicators_to_keep = indicators_to_keep,\n", + " pyramid_names = pyramid_names\n", + ")\n", "arrow::write_parquet(routine_imputed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_imputed.parquet\")))\n", "log_msg(glue::glue(\"Exported routine imputed table to {COUNTRY_CODE}_routine_outliers_imputed.parquet\"))\n", "\n", @@ -405,12 +321,20 @@ "removed_long <- copy(flagged_outliers_mg)\n", "removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_]\n", "\n", - "routine_removed <- to_routine_wide(removed_long)\n", + "routine_removed <- to_routine_wide(\n", + " dt_long = removed_long,\n", + " fixed_cols = fixed_cols,\n", + " indicators_to_keep = indicators_to_keep,\n", + " pyramid_names = pyramid_names\n", + ")\n", "arrow::write_parquet(routine_removed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_removed.parquet\")))\n", "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", "\n", "log_msg(\"MG outlier tables exported successfully.\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "d6adc76b" } ], "metadata": { @@ -430,4 +354,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 4751905..643cfa5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "c254a43d", "metadata": {}, "source": [ "# MG Outliers Detection Report\n", @@ -13,31 +12,31 @@ "- **Méthode complète** : MAD15 → MAD10 → seasonal5 → seasonal3 (outliers complets).\n", "\n", "Les sections ci-dessous résument le nombre d’outliers détectés et leur répartition par indicateur et par année." - ] + ], + "id": "c254a43d" }, { "cell_type": "code", - "execution_count": null, - "id": "de9e854f", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Setup\n", "ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"outliers_imputation\")\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_magic_glasses_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\")\n", + ")\n", "\n", - "# Align logging init with other production notebooks\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- file.path(setup_ctx$DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "openhexa <- setup_ctx$openhexa\n", "\n", "config_json <- fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", @@ -75,26 +74,26 @@ "} else {\n", " log_msg(\"MG detected outlier file not found.\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "de9e854f" }, { "cell_type": "markdown", - "id": "181df7b3", "metadata": {}, "source": [ "## 1. Résumé des outliers détectés" - ] + ], + "id": "181df7b3" }, { "cell_type": "code", - "execution_count": null, - "id": "f8c790fd", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Tableau récapitulatif (totaux)\n", "summary_totals <- data.frame(\n", @@ -105,26 +104,26 @@ " )\n", ")\n", "knitr::kable(summary_totals, format = \"simple\", col.names = c(\"Méthode\", \"Nombre d'outliers\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8c790fd" }, { "cell_type": "markdown", - "id": "14eb3d09", "metadata": {}, "source": [ "## 2. Répartition par indicateur" - ] + ], + "id": "14eb3d09" }, { "cell_type": "code", - "execution_count": null, - "id": "71e65233", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode partielle)\n", "if (!is.null(partial_tbl) && nrow(partial_tbl) > 0 && \"INDICATOR\" %in% names(partial_tbl)) {\n", @@ -136,18 +135,18 @@ "} else {\n", " cat(\"Aucune donnée partielle disponible.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "71e65233" }, { "cell_type": "code", - "execution_count": null, - "id": "3223c9b2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode complète)\n", "if (!is.null(complete_tbl) && nrow(complete_tbl) > 0 && \"INDICATOR\" %in% names(complete_tbl)) {\n", @@ -159,18 +158,18 @@ "} else {\n", " cat(\"Aucune donnée complète disponible (ou méthode complète non exécutée).\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "3223c9b2" }, { "cell_type": "code", - "execution_count": null, - "id": "39239983", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Graphique : répartition par indicateur (partiel et/ou complet)\n", "plot_df <- NULL\n", @@ -204,26 +203,26 @@ "} else {\n", " cat(\"Aucune donnée à afficher.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "39239983" }, { "cell_type": "markdown", - "id": "28666b74", "metadata": {}, "source": [ "## 3. Évolution par année" - ] + ], + "id": "28666b74" }, { "cell_type": "code", - "execution_count": null, - "id": "1b1fcd6d", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Tableau : nombre d'outliers par année (partiel et/ou complet)\n", "year_tab <- NULL\n", @@ -239,18 +238,18 @@ "} else {\n", " cat(\"Aucune donnée par année disponible.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1b1fcd6d" }, { "cell_type": "code", - "execution_count": null, - "id": "1747b5b6", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Graphique : évolution du nombre d'outliers par année\n", "year_df <- NULL\n", @@ -281,7 +280,10 @@ "} else {\n", " cat(\"Aucune donnée par année à afficher.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1747b5b6" } ], "metadata": { @@ -301,4 +303,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R new file mode 100644 index 0000000..3e1d63d --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R @@ -0,0 +1,28 @@ +# Shared bootstrap for Magic Glasses notebooks. +bootstrap_magic_glasses_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(unique(required_packages)) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R new file mode 100644 index 0000000..ce57811 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R @@ -0,0 +1,97 @@ +detect_outliers_mad_custom <- function(dt, deviation) { + flag_col <- paste0("OUTLIER_MAD", deviation) + dt <- data.table::copy(dt) + dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))] + dt[is.na(get(flag_col)), (flag_col) := FALSE] + dt[, c("median_val", "mad_val") := NULL] + dt +} + +detect_seasonal_outliers <- function(dt, deviation, workers = 1) { + outlier_col <- paste0("OUTLIER_SEASONAL", deviation) + dt <- data.table::copy(dt) + data.table::setorder(dt, OU_ID, INDICATOR, PERIOD) + + process_group <- function(sub_dt) { + n_valid <- sum(!is.na(sub_dt$VALUE)) + if (n_valid < 2) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + values <- as.numeric(sub_dt$VALUE) + ts_data <- stats::ts(values, frequency = 12) + cleaned_ts <- tryCatch( + forecast::tsclean(ts_data, replace.missing = TRUE), + error = function(e) ts_data + ) + mad_val <- mad(values, constant = 1, na.rm = TRUE) + + if (is.na(mad_val) || mad_val == 0) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation + is_outlier[is.na(is_outlier)] <- FALSE + + data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = as.logical(is_outlier) + ) + } + + group_keys <- unique(dt[, .(OU_ID, INDICATOR)]) + group_list <- lapply(seq_len(nrow(group_keys)), function(i) { + dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]] + }) + + if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) { + result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE) + } else { + result_list <- lapply(group_list, process_group) + } + + outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE) + data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col) + + result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE) + result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE] + result_dt +} + +to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { + routine_wide <- data.table::dcast( + dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], + PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR, + value.var = "VALUE" + ) + + routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) + + target_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", indicators_to_keep) + for (col in setdiff(target_cols, names(routine_wide))) { + if (col %in% indicators_to_keep) { + routine_wide[, (col) := NA_real_] + } else if (col %in% c("YEAR", "MONTH")) { + routine_wide[, (col) := NA_integer_] + } else { + routine_wide[, (col) := NA_character_] + } + } + cols_to_keep <- intersect(target_cols, names(routine_wide)) + routine_wide <- routine_wide[, ..cols_to_keep] + routine_wide +} From a2cb550bcfecb4dfa95cd90909bba5bf53af13cb Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 17:32:13 +0100 Subject: [PATCH 04/17] Enhance SNT utility functions for configuration loading and data processing. Introduced error handling for configuration loading, validation of required keys, and standardized output path creation. Updated outlier imputation notebooks to utilize these new utilities, improving modularity and readability. --- code/snt_utils.r | 108 ++++++++++++++++++ ...s2_outliers_imputation_magic_glasses.ipynb | 46 ++++---- ...iers_imputation_magic_glasses_report.ipynb | 74 ++++++------ .../snt_dhis2_outliers_imputation_mean.ipynb | 67 +++++------ 4 files changed, 199 insertions(+), 96 deletions(-) diff --git a/code/snt_utils.r b/code/snt_utils.r index 8efda14..838e276 100644 --- a/code/snt_utils.r +++ b/code/snt_utils.r @@ -58,6 +58,114 @@ install_and_load <- function(packages) { print(loaded_packages) } +# Load SNT configuration file with a consistent error message +load_snt_config <- function(config_path, config_file_name = "SNT_config.json") { + config_file <- file.path(config_path, config_file_name) + config_json <- tryCatch( + { + jsonlite::fromJSON(config_file) + }, + error = function(e) { + msg <- paste0("[ERROR] Error while loading configuration ", conditionMessage(e)) + stop(msg) + } + ) + return(config_json) +} + +# Validate that required keys exist in a config section +validate_required_config_keys <- function(config_json, keys, section = "SNT_CONFIG") { + if (is.null(config_json[[section]])) { + stop(paste0("[ERROR] Missing configuration section: ", section)) + } + + missing_keys <- keys[!keys %in% names(config_json[[section]])] + if (length(missing_keys) > 0) { + stop(paste0("[ERROR] Missing configuration input(s): ", paste(missing_keys, collapse = ", "))) + } + + invisible(TRUE) +} + +# Generic helper to load a country-specific dataset file +load_country_file_from_dataset <- function(dataset_id, country_code, suffix, label = NULL) { + file_name <- paste0(country_code, suffix) + output_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, file_name) + }, + error = function(e) { + target_label <- if (is.null(label)) file_name else label + msg <- paste0( + "[ERROR] Error while loading ", + target_label, + " (dataset: ", + dataset_id, + ", file: ", + file_name, + "): ", + conditionMessage(e) + ) + stop(msg) + } + ) + + log_msg(paste0("Loaded file `", file_name, "` from dataset `", dataset_id, "`.")) + return(output_data) +} + +# Ensure YEAR and MONTH are stored as integers when present +normalize_year_month_types <- function(input_df, year_col = "YEAR", month_col = "MONTH") { + output_df <- input_df + if (year_col %in% names(output_df)) { + output_df[[year_col]] <- as.integer(output_df[[year_col]]) + } + if (month_col %in% names(output_df)) { + output_df[[month_col]] <- as.integer(output_df[[month_col]]) + } + return(output_df) +} + +# Standard routine preparation: select, pivot longer, optional deduplication +prepare_routine_long <- function(routine_df, fixed_cols, indicators, deduplicate = TRUE) { + cols_to_select <- intersect(c(fixed_cols, indicators), names(routine_df)) + missing_indicators <- setdiff(indicators, names(routine_df)) + if (length(missing_indicators) > 0) { + stop(paste0("[ERROR] Missing indicator column(s): ", paste(missing_indicators, collapse = ", "))) + } + + routine_long <- routine_df %>% + dplyr::select(dplyr::all_of(cols_to_select)) %>% + tidyr::pivot_longer( + cols = dplyr::all_of(indicators), + names_to = "INDICATOR", + values_to = "VALUE" + ) + + if (deduplicate) { + dedup_keys <- intersect(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "YEAR", "MONTH", "INDICATOR"), names(routine_long)) + routine_long <- routine_long %>% + dplyr::distinct(dplyr::across(dplyr::all_of(dedup_keys)), .keep_all = TRUE) + } + + return(routine_long) +} + +# Build a standardized output path under /data and create it if needed +standard_output_path <- function(data_root_path, domain, subdomain = NULL, create_dir = TRUE) { + target_path <- if (is.null(subdomain) || nchar(subdomain) == 0) { + file.path(data_root_path, domain) + } else { + file.path(data_root_path, domain, subdomain) + } + + if (create_dir && !dir.exists(target_path)) { + dir.create(target_path, recursive = TRUE, showWarnings = FALSE) + } + + return(target_path) +} + # Helper to safely extract values from parameters (allows to specify the type) get_param <- function(params_list, target_param, default, cast_method = identity) { #' Safely retrieve a parameter if it exists in the input, using a default fallback if it doesn't exist in the inupt diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index c3b5b3a..78a0083 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "4e1fe23c", "metadata": {}, "source": [ "# Outliers Detection - Magic Glasses (MG)\n", @@ -9,16 +10,18 @@ "Dedicated notebook for MG outlier detection only:\n", "- `OUTLIER_MAGIC_GLASSES_PARTIAL` (MAD15 -> MAD10)\n", "- `OUTLIER_MAGIC_GLASSES_COMPLETE` (MAD15 -> MAD10 -> seasonal5 -> seasonal3)" - ], - "id": "4e1fe23c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1ddc1fb2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Parameters with safe defaults\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", @@ -47,18 +50,18 @@ "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", "OUTPUT_DIR <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", "dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)" - ], - "execution_count": null, - "outputs": [], - "id": "1ddc1fb2" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c91aab68", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", @@ -127,34 +130,34 @@ "} else {\n", " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "c91aab68" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "652f7a2e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Helpers loaded from utils/magic_glasses_utils.R\n", "# - detect_outliers_mad_custom()\n", "# - detect_seasonal_outliers()" - ], - "execution_count": null, - "outputs": [], - "id": "652f7a2e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1720f1f3", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"Starting MAD15 detection...\")\n", @@ -222,18 +225,18 @@ " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", " log_msg(glue::glue(\"SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "1720f1f3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "d6adc76b", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", "flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", @@ -331,10 +334,7 @@ "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", "\n", "log_msg(\"MG outlier tables exported successfully.\")" - ], - "execution_count": null, - "outputs": [], - "id": "d6adc76b" + ] } ], "metadata": { @@ -354,4 +354,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 643cfa5..3ceddcb 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "c254a43d", "metadata": {}, "source": [ "# MG Outliers Detection Report\n", @@ -12,16 +13,18 @@ "- **Méthode complète** : MAD15 → MAD10 → seasonal5 → seasonal3 (outliers complets).\n", "\n", "Les sections ci-dessous résument le nombre d’outliers détectés et leur répartition par indicateur et par année." - ], - "id": "c254a43d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "de9e854f", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Setup\n", "ROOT_PATH <- \"~/workspace\"\n", @@ -74,26 +77,26 @@ "} else {\n", " log_msg(\"MG detected outlier file not found.\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "de9e854f" + ] }, { "cell_type": "markdown", + "id": "181df7b3", "metadata": {}, "source": [ "## 1. Résumé des outliers détectés" - ], - "id": "181df7b3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f8c790fd", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Tableau récapitulatif (totaux)\n", "summary_totals <- data.frame(\n", @@ -104,26 +107,26 @@ " )\n", ")\n", "knitr::kable(summary_totals, format = \"simple\", col.names = c(\"Méthode\", \"Nombre d'outliers\"))" - ], - "execution_count": null, - "outputs": [], - "id": "f8c790fd" + ] }, { "cell_type": "markdown", + "id": "14eb3d09", "metadata": {}, "source": [ "## 2. Répartition par indicateur" - ], - "id": "14eb3d09" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "71e65233", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode partielle)\n", "if (!is.null(partial_tbl) && nrow(partial_tbl) > 0 && \"INDICATOR\" %in% names(partial_tbl)) {\n", @@ -135,18 +138,18 @@ "} else {\n", " cat(\"Aucune donnée partielle disponible.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "71e65233" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3223c9b2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode complète)\n", "if (!is.null(complete_tbl) && nrow(complete_tbl) > 0 && \"INDICATOR\" %in% names(complete_tbl)) {\n", @@ -158,18 +161,18 @@ "} else {\n", " cat(\"Aucune donnée complète disponible (ou méthode complète non exécutée).\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "3223c9b2" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "39239983", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Graphique : répartition par indicateur (partiel et/ou complet)\n", "plot_df <- NULL\n", @@ -203,26 +206,26 @@ "} else {\n", " cat(\"Aucune donnée à afficher.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "39239983" + ] }, { "cell_type": "markdown", + "id": "28666b74", "metadata": {}, "source": [ "## 3. Évolution par année" - ], - "id": "28666b74" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1b1fcd6d", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Tableau : nombre d'outliers par année (partiel et/ou complet)\n", "year_tab <- NULL\n", @@ -238,18 +241,18 @@ "} else {\n", " cat(\"Aucune donnée par année disponible.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "1b1fcd6d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1747b5b6", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Graphique : évolution du nombre d'outliers par année\n", "year_df <- NULL\n", @@ -280,10 +283,7 @@ "} else {\n", " cat(\"Aucune donnée par année à afficher.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "1747b5b6" + ] } ], "metadata": { @@ -303,4 +303,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 2f73668..62c5d49 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -134,15 +134,12 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", + "# Check SNT configuration (shared helper)\n", + "validate_required_config_keys(\n", + " config_json = config_json,\n", + " keys = c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\"),\n", + " section = \"SNT_CONFIG\"\n", + ")\n", "\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", @@ -180,16 +177,15 @@ }, "outputs": [], "source": [ - "# Load file from dataset (formatting)\n", + "# Load file from dataset (formatting) using shared helper\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_country_file_from_dataset(\n", + " dataset_id = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " suffix = \"_routine.parquet\",\n", + " label = \"DHIS2 routine data\"\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" @@ -206,8 +202,9 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR and MONTH should be integers\n", + "# Use shared helper to normalize types when columns exist\n", + "dhis2_routine <- normalize_year_month_types(dhis2_routine)" ] }, { @@ -268,9 +265,13 @@ }, "outputs": [], "source": [ - "dhis2_routine_long <- dhis2_routine %>%\n", - " select(all_of(c(fixed_cols, DHIS2_INDICATORS))) %>%\n", - " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\")\n", + "# Use shared helper to select, pivot and deduplicate routine rows\n", + "dhis2_routine_long <- prepare_routine_long(\n", + " routine_df = dhis2_routine,\n", + " fixed_cols = fixed_cols,\n", + " indicators = DHIS2_INDICATORS,\n", + " deduplicate = TRUE\n", + ")\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -295,19 +296,8 @@ }, "outputs": [], "source": [ - "# check if there are any duplicates\n", - "duplicated <- dhis2_routine_long %>%\n", - " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR) %>%\n", - " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", - " filter(n > 1L)\n", - "\n", - "# Remove dups\n", - "if (nrow(duplicated) > 0) {\n", - " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " dhis2_routine_long <- dhis2_routine_long %>%\n", - " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR, .keep_all = TRUE)\n", - " head(duplicated)\n", - "}" + "# Duplicates are handled by prepare_routine_long(..., deduplicate = TRUE)\n", + "log_msg(\"Routine long data prepared with shared helper (deduplication applied).\")" ] }, { @@ -685,7 +675,12 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- standard_output_path(\n", + " data_root_path = DATA_PATH,\n", + " domain = \"dhis2\",\n", + " subdomain = \"outliers_imputation\",\n", + " create_dir = TRUE\n", + ")\n", "\n", "# Mean detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", From b4bd49a3261838fe7ba775ba3eeb9daad142b73b Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 25 Mar 2026 10:10:06 +0100 Subject: [PATCH 05/17] Refactor outlier imputation and reporting notebooks to utilize new shared helper scripts. Removed the old bootstrap script and updated paths for loading configuration and utility functions, enhancing modularity and readability across the pipeline. --- .../snt_dhis2_outliers_imputation_iqr.ipynb | 13 ++--- ...dhis2_outliers_imputation_iqr_report.ipynb | 17 +++---- .../utils/bootstrap.R | 45 ------------------ ....R => snt_dhis2_outliers_imputation_iqr.r} | 46 ++++++++++++++++++ ...nt_dhis2_outliers_imputation_iqr_report.r} | 47 +++++++++++++++++++ 5 files changed, 105 insertions(+), 63 deletions(-) delete mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R rename pipelines/snt_dhis2_outliers_imputation_iqr/utils/{imputation_utils.R => snt_dhis2_outliers_imputation_iqr.r} (55%) rename pipelines/snt_dhis2_outliers_imputation_iqr/utils/{reporting_utils.R => snt_dhis2_outliers_imputation_iqr_report.r} (81%) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index dd66522..1731e12 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -61,8 +61,8 @@ "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_iqr.r\"))\n", "setup_ctx <- bootstrap_iqr_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", @@ -71,10 +71,7 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Pipeline-specific helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -528,7 +525,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr.r\n", "start_time <- Sys.time()" ] }, @@ -606,7 +603,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 2fea8de..871e3eb 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -37,8 +37,8 @@ "SNT_ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_iqr_report.r\"))\n", "setup_ctx <- bootstrap_iqr_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", @@ -46,10 +46,7 @@ "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Reporting helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -88,7 +85,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/reporting_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { @@ -222,7 +219,7 @@ }, "outputs": [], "source": [ - "# Plot helpers loaded from utils/reporting_utils.R\n", + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" ] @@ -671,7 +668,7 @@ }, "outputs": [], "source": [ - "# Coherence heatmap helper loaded from utils/reporting_utils.R" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { @@ -738,7 +735,7 @@ }, "outputs": [], "source": [ - "# Coherence map helper loaded from utils/reporting_utils.R" + "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R deleted file mode 100644 index 4f9e7b3..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R +++ /dev/null @@ -1,45 +0,0 @@ -# Shared bootstrap for the IQR outliers pipeline notebooks. -bootstrap_iqr_context <- function( - root_path = "~/workspace", - required_packages = c( - "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", - "reticulate", "glue", "zoo" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r similarity index 55% rename from pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R rename to pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r index da3b1e9..35204c2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r @@ -1,3 +1,49 @@ +# Shared bootstrap for the IQR outliers pipeline notebooks. +bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + # Compute moving-average imputations for a selected outlier flag column. impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r similarity index 81% rename from pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R rename to pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r index cc6c620..6625b45 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r @@ -1,3 +1,50 @@ +# Shared bootstrap for the IQR outliers reporting notebook. +bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", + "viridis", "patchwork", "zoo", "scales", "purrr", "arrow", + "sf", "reticulate", "knitr", "glue", "forcats" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") } From 9b838fa279499a9265e7644738aac4f1e77d8fb3 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 25 Mar 2026 12:07:04 +0100 Subject: [PATCH 06/17] Refactor IQR outliers reporting notebook and utility functions to enhance modularity and readability. Updated helper functions for plotting outliers and added error handling for missing data. Improved configuration loading and path management for better integration across the pipeline. --- ...dhis2_outliers_imputation_iqr_report.ipynb | 379 +++++--------- ...snt_dhis2_outliers_imputation_iqr_report.r | 494 +++++++++++------- 2 files changed, 425 insertions(+), 448 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 871e3eb..a927dd7 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -2,36 +2,33 @@ "cells": [ { "cell_type": "markdown", - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", "metadata": {}, "source": [ "## **Détection des valeurs aberrantes — Méthode IQR**" - ] + ], + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" }, { "cell_type": "code", - "execution_count": null, - "id": "43794265-533f-4035-bf3d-975a3409507b", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "## CONFIGURATION ##" - ] + ], + "execution_count": null, + "outputs": [], + "id": "43794265-533f-4035-bf3d-975a3409507b" }, { "cell_type": "code", - "execution_count": null, - "id": "2ced7513-0ee6-4b9b-ac07-124e510119af", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", @@ -47,18 +44,18 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "openhexa <- setup_ctx$openhexa" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2ced7513-0ee6-4b9b-ac07-124e510119af" }, { "cell_type": "code", - "execution_count": null, - "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Load SNT config from bootstrap context\n", "config_json <- setup_ctx$config_json\n", @@ -72,32 +69,32 @@ "ADM_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL" - ] + ], + "execution_count": null, + "outputs": [], + "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51" }, { "cell_type": "code", - "execution_count": null, - "id": "f8edc2a5-07ce-4507-9939-4322fc510593", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8edc2a5-07ce-4507-9939-4322fc510593" }, { "cell_type": "code", - "execution_count": null, - "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# import routine data\n", "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")) }, \n", @@ -122,26 +119,26 @@ " })\n", "\n", "printdim(routine_data)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac" }, { "cell_type": "markdown", - "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b", "metadata": {}, "source": [ "### 1. Résumé des valeurs aberrantes détectées dans les données de routine" - ] + ], + "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b" }, { "cell_type": "code", - "execution_count": null, - "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "outlier_flags <- routine_data %>%\n", " # Keep only OUTLIER_* columns that are logical flags.\n", @@ -178,26 +175,26 @@ "}\n", "\n", "outlier_summary_long" - ] + ], + "execution_count": null, + "outputs": [], + "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2" }, { "cell_type": "markdown", - "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d", "metadata": {}, "source": [ "### 2. Visualisation des valeurs aberrantes (méthode IQR)" - ] + ], + "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d" }, { "cell_type": "code", - "execution_count": null, - "id": "abeae17b-935d-49d9-a239-89985e469d81", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "#--- PARAMETERS ---\n", "outlier_cols <- if (\"OUTLIER_DETECTED\" %in% names(routine_data)) {\n", @@ -206,27 +203,29 @@ " routine_data %>% select(starts_with(\"OUTLIER_\")) %>% names()\n", "}\n", "print(outlier_cols)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "abeae17b-935d-49d9-a239-89985e469d81" }, { "cell_type": "code", - "execution_count": null, - "id": "99dd199e-bcf1-4900-b96e-f0f3285caec2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" - ] + ], + "execution_count": null, + "outputs": [], + "id": "99dd199e-bcf1-4900-b96e-f0f3285caec2" }, { "cell_type": "markdown", - "id": "764e6f6a-f810-4077-8ed4-6d5b24c4caf4", "metadata": {}, "source": [ "### Include plots \n", @@ -234,18 +233,16 @@ "-Clean folder \n", "-Save Images \n", "-Load the images " - ] + ], + "id": "764e6f6a-f810-4077-8ed4-6d5b24c4caf4" }, { "cell_type": "code", - "execution_count": null, - "id": "8e4834fe-16d3-40ca-91a3-a38cd8e301d0", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Create folder if it doesn't exist\n", "output_dir <- file.path(getwd(), \"outputs/plots\")\n", @@ -255,18 +252,18 @@ " files <- list.files(output_dir, full.names = TRUE)\n", " if (length(files) > 0) file.remove(files)\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "8e4834fe-16d3-40ca-91a3-a38cd8e301d0" }, { "cell_type": "code", - "execution_count": null, - "id": "df34b8d6-489f-42d7-a9b3-3862edc6b780", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "selected_inds <- c(\"SUSP\", \"TEST\", \"CONF\")\n", "\n", @@ -288,19 +285,21 @@ "for (img in img_files) {\n", " IRdisplay::display_png(file = img)\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "df34b8d6-489f-42d7-a9b3-3862edc6b780" }, { "cell_type": "markdown", - "id": "1ad3de61-1ff3-430b-83d8-f1c9fc924b43", "metadata": {}, "source": [ "### 3. Cohérence des indicateurs au niveau nationale" - ] + ], + "id": "1ad3de61-1ff3-430b-83d8-f1c9fc924b43" }, { "cell_type": "markdown", - "id": "bd92f817-83a5-4597-b547-c9f0a506c08a", "metadata": {}, "source": [ "La section ci-dessous est un extrait des explications fournies par la **Community code library for SNT**. Veuillez consulter le site Web pour obtenir des explications complètes: https://ahadi-analytics.github.io/snt-code-library/english/library/data/routine_cases/quality_control.html#cb19-55\n", @@ -320,18 +319,16 @@ "**Décès toutes causes confondues ≥ décès dus au paludisme**: un rapport de 1:1 implique que tous les décès déclarés sont attribués au paludisme. Lorsque les décès toutes causes confondues sont inférieurs aux décès dus au paludisme, cela reflète des problèmes de qualité des données liés à une déclaration inadéquate de l'une ou des deux variables.\n", "\n", "**Admissions pour paludisme ≥ décès dus au paludisme**: un rapport de 1:1 implique que tous les cas de paludisme hospitalisés sont décédés, soit un taux de mortalité hospitalière de 100 %. Lorsque les admissions pour paludisme sont inférieures aux décès dus au paludisme, cela reflète des problèmes de qualité des données liés à une déclaration inadéquate de l'une ou des deux variables." - ] + ], + "id": "bd92f817-83a5-4597-b547-c9f0a506c08a" }, { "cell_type": "code", - "execution_count": null, - "id": "4ff0b337-2370-44ca-ac66-24068e7680c0", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Step 1: Extract year, month from PERIOD & aggregate\n", "routine_month <- routine_data_imputed %>%\n", @@ -348,18 +345,18 @@ " PRES = sum(PRES, na.rm = TRUE),\n", " .groups = \"drop\"\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "4ff0b337-2370-44ca-ac66-24068e7680c0" }, { "cell_type": "code", - "execution_count": null, - "id": "e183891f-c823-4fa2-9cbf-74143a8526fc", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Step 2: Plot monthly national trends\n", "options(repr.plot.width = 14, repr.plot.height = 6)\n", @@ -379,18 +376,18 @@ " legend.title = element_text(size = 16),\n", " legend.text = element_text(size = 16)\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e183891f-c823-4fa2-9cbf-74143a8526fc" }, { "cell_type": "code", - "execution_count": null, - "id": "408b0396-346b-45a5-9358-6375400a6767", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Identify indicator columns automatically (all numeric except YEAR, MONTH, IDs)\n", "indicator_cols <- routine_data_imputed %>%\n", @@ -404,18 +401,18 @@ " ungroup()\n", "\n", "yearly_totals %>% select(YEAR, SUSP, TEST, CONF)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "408b0396-346b-45a5-9358-6375400a6767" }, { "cell_type": "code", - "execution_count": null, - "id": "54c415c8-8431-413c-9373-4d957776eff7", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Step 3: Create scatter plots\n", "routine_hd_month <- routine_data_imputed %>%\n", @@ -456,257 +453,138 @@ "\n", "# Step 3: Combine plots\n", "(p1 | p2 | p3) + plot_layout(guides = \"collect\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "54c415c8-8431-413c-9373-4d957776eff7" }, { "cell_type": "markdown", - "id": "61c5a165-af0e-45a4-9d2e-8901451a4d6b", "metadata": {}, "source": [ "Le graphique en bas montre le **pourcentage de rapports mensuels des formations sanitaires au niveau national** qui ont passé chaque contrôle de cohérence pour chaque année. Chaque cellule indique la proportion de rapports mensuels d’une année donnée qui respectent la règle de cohérence correspondante. Évaluer ces contrôles d’une année à l’autre et entre catégories permet d’identifier les **tendances générales de la qualité des données**." - ] + ], + "id": "61c5a165-af0e-45a4-9d2e-8901451a4d6b" }, { "cell_type": "code", - "execution_count": null, - "id": "4e530e1c-c6be-4de0-bc23-57ce1135ed59", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" - ] + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" + ], + "execution_count": null, + "outputs": [], + "id": "4e530e1c-c6be-4de0-bc23-57ce1135ed59" }, { "cell_type": "code", - "execution_count": null, - "id": "74f70f5c-a56d-4c47-bc5a-3cc828759d54", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" - ] + ], + "execution_count": null, + "outputs": [], + "id": "74f70f5c-a56d-4c47-bc5a-3cc828759d54" }, { "cell_type": "markdown", - "id": "da37c36d-bf7e-4ed7-8cfc-b377329b8d89", "metadata": {}, "source": [ "### 4. Visualisation de la cohérence au niveau du AMD1" - ] + ], + "id": "da37c36d-bf7e-4ed7-8cfc-b377329b8d89" }, { "cell_type": "code", - "execution_count": null, - "id": "ca2d499c-4f17-4ec0-a090-3ca75daa914e", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "ca2d499c-4f17-4ec0-a090-3ca75daa914e" }, { "cell_type": "code", - "execution_count": null, - "id": "c159c47c-a2b1-411a-98fb-50d5173ebd0b", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c159c47c-a2b1-411a-98fb-50d5173ebd0b" }, { "cell_type": "code", - "execution_count": null, - "id": "f11eb536-b633-40fb-b182-f67ea51e0c66", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Plot per year\n", "years_available <- sort(unique(adm_long$YEAR))\n", "for (year in years_available) {\n", " plot_coherence_heatmap(df = adm_long, selected_year = year, agg_level = \"ADM1_NAME\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f11eb536-b633-40fb-b182-f67ea51e0c66" }, { "cell_type": "markdown", - "id": "76315fa7-2fa2-4ea2-baf2-bc427b3f659a", "metadata": {}, "source": [ "### 5. Visualisation de la cohérence au niveau du AMD2" - ] + ], + "id": "76315fa7-2fa2-4ea2-baf2-bc427b3f659a" }, { "cell_type": "code", - "execution_count": null, - "id": "8930acc3-aafa-4ecb-9a1d-a48fc2faf1cd", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "shapes_data <- shapes_data %>%\n", " mutate(ADM2_ID = as.character(ADM2_ID))\n", @@ -722,32 +600,32 @@ " ADM2_NAME_shape = ADM2_NAME.x,\n", " ADM2_NAME_data = ADM2_NAME.y\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "8930acc3-aafa-4ecb-9a1d-a48fc2faf1cd" }, { "cell_type": "code", - "execution_count": null, - "id": "002ca971-fdff-4e12-ad0f-5daae51ba126", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ] + ], + "execution_count": null, + "outputs": [], + "id": "002ca971-fdff-4e12-ad0f-5daae51ba126" }, { "cell_type": "code", - "execution_count": null, - "id": "88982e8e-43f4-4b6d-9f04-a61a03217c8d", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Loop over all available columns\n", "for (check_col in names(check_labels)) {\n", @@ -763,19 +641,22 @@ " # width = 14, height = 10, dpi = 300)\n", " }\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "88982e8e-43f4-4b6d-9f04-a61a03217c8d" }, { "cell_type": "code", - "execution_count": null, - "id": "5d0b89bb-53c8-4d0d-aa39-71341956f1d0", "metadata": { "vscode": { "languageId": "r" } }, + "source": [], + "execution_count": null, "outputs": [], - "source": [] + "id": "5d0b89bb-53c8-4d0d-aa39-71341956f1d0" } ], "metadata": { @@ -795,4 +676,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r index 6625b45..38de5d1 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r @@ -1,213 +1,309 @@ -# Shared bootstrap for the IQR outliers reporting notebook. -bootstrap_iqr_context <- function( - root_path = "~/workspace", - required_packages = c( - "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", - "viridis", "patchwork", "zoo", "scales", "purrr", "arrow", - "sf", "reticulate", "knitr", "glue", "forcats" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) +# Report helpers for the IQR outliers pipeline. - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) +`%||%` <- function(x, y) if (!is.null(x)) x else y + +# Pull in bootstrap + shared non-report helpers (same folder). +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() } +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_iqr.r")) printdim <- function(df, name = deparse(substitute(df))) { - cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") + if (is.null(df)) { + message(sprintf("%s: NULL", name)) + return(invisible(NULL)) + } + d <- dim(df) + message(sprintf("%s: %s x %s", name, d[1], d[2])) + invisible(d) } -plot_outliers <- function(ind_name, df, outlier_col) { - df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) - df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) - - ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + - ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - ggplot2::aes(x = YEAR, y = VALUE), - color = "red", - size = 2.8, - alpha = 0.85, - na.rm = TRUE - ) + - ggplot2::labs( - title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), - subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", - x = "Année", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 14) +plot_outliers <- function(ind_name, df, outlier_col = "OUTLIER_DETECTED") { + if (!ind_name %in% names(df)) return(NULL) + if (!outlier_col %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate( + YEAR = as.integer(.data$YEAR %||% substr(.data$PERIOD, 1, 4)), + MONTH = as.integer(.data$MONTH %||% substr(.data$PERIOD, 5, 6)), + DATE = as.Date(sprintf("%04d-%02d-01", YEAR, MONTH)) + ) %>% + dplyr::group_by(.data$DATE) %>% + dplyr::summarise( + value = sum(.data[[ind_name]], na.rm = TRUE), + has_outlier = any(.data[[outlier_col]] %in% TRUE, na.rm = TRUE), + .groups = "drop" + ) + + ggplot2::ggplot(d, ggplot2::aes(x = .data$DATE, y = .data$value)) + + ggplot2::geom_line(linewidth = 0.8, color = "grey40") + + ggplot2::geom_point(ggplot2::aes(color = .data$has_outlier), size = 2, alpha = 0.9) + + ggplot2::scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) + + ggplot2::labs( + title = sprintf("Outliers - %s (%s)", ind_name, outlier_col), + x = "Mois", + y = "Valeur agregee", + color = "Outlier present" + ) + + ggplot2::theme_minimal(base_size = 14) } -plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { - df_ind <- df %>% - dplyr::filter( - INDICATOR == ind_name, - !is.na(YEAR), - !is.na(VALUE), - is.finite(VALUE) - ) - - if (nrow(df_ind) == 0) { - return(NULL) - } - - ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + - ggplot2::geom_point(color = "grey60", alpha = 0.3) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - color = "red", - size = 2.8, - alpha = 0.85 - ) + - ggplot2::facet_wrap(~ YEAR, scales = "free_y") + - ggplot2::labs( - title = paste("Détection des valeurs aberrantes —", ind_name), - subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), - x = "District (ADM2)", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 13) + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7) - ) +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col = "OUTLIER_DETECTED") { + if (!ind_name %in% names(df)) return(NULL) + if (!outlier_col %in% names(df)) return(NULL) + if (!("ADM2_NAME" %in% names(df) && "ADM2_ID" %in% names(df))) return(NULL) + + d <- df %>% + dplyr::mutate( + YEAR = as.integer(.data$YEAR %||% substr(.data$PERIOD, 1, 4)), + MONTH = as.integer(.data$MONTH %||% substr(.data$PERIOD, 5, 6)), + DATE = as.Date(sprintf("%04d-%02d-01", YEAR, MONTH)) + ) %>% + dplyr::group_by(.data$ADM2_ID, .data$ADM2_NAME, .data$YEAR, .data$MONTH, .data$DATE) %>% + dplyr::summarise( + value = sum(.data[[ind_name]], na.rm = TRUE), + has_outlier = any(.data[[outlier_col]] %in% TRUE, na.rm = TRUE), + .groups = "drop" + ) + + if (nrow(d) == 0) return(NULL) + + ggplot2::ggplot( + d, + ggplot2::aes(x = .data$DATE, y = .data$value, group = .data$ADM2_ID) + ) + + ggplot2::geom_line(alpha = 0.35, linewidth = 0.4, color = "grey40") + + ggplot2::geom_point(ggplot2::aes(color = .data$has_outlier), alpha = 0.75, size = 1) + + ggplot2::scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "grey70")) + + ggplot2::facet_wrap(~YEAR, scales = "free_x") + + ggplot2::labs( + title = sprintf("Outliers par district - %s (%s)", ind_name, outlier_col), + x = "Mois", + y = "Valeur (ADM2 agrege)", + color = "Outlier" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + legend.position = "bottom", + strip.text = ggplot2::element_text(face = "bold") + ) } -plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!agg_level %in% names(df)) { - stop(paste0("Aggregation level '", agg_level, "' not found in data!")) - } - - df_year <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% - dplyr::summarise( - pct_coherent = mean(pct_coherent, na.rm = TRUE), - .groups = "drop" - ) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% - dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) - - n_units <- dplyr::n_distinct(df_year[[agg_level]]) - plot_height <- max(6, 0.5 * n_units) - agg_label <- if (agg_level == "ADM1_NAME") { - "niveau administratif 1" - } else if (agg_level == "ADM2_NAME") { - "niveau administratif 2" - } else { - agg_level - } - - p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::geom_text( - ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), - size = 5, - fontface = "bold", - color = "white" - ) + - viridis::scale_fill_viridis( - name = "% cohérent", - limits = c(0, 100), - option = "viridis", - direction = -1 - ) + - ggplot2::labs( - title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), - x = "Règle de cohérence", - y = agg_label - ) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.y = ggplot2::element_text(size = 12), - axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), - plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), - legend.title = ggplot2::element_text(size = 12), - legend.text = ggplot2::element_text(size = 10) - ) - - options(repr.plot.width = 14, repr.plot.height = plot_height) - - if (!is.null(filename)) { - ggplot2::ggsave( - filename = filename, - plot = p, - width = 14, - height = plot_height, - dpi = 300, - limitsize = FALSE - ) - } - if (do_plot) { - print(p) - } +plot_coherence_heatmap <- function( + df, + selected_year, + agg_level = "ADM1_NAME", + filename = NULL, + do_plot = TRUE +) { + if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) + ) + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + + ggplot2::labs( + title = sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) + invisible(p) } plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!col_name %in% names(map_data)) { - stop(paste0("Column '", col_name, "' not found in the data!")) - } - - if (is.null(indicator_label)) { - indicator_label <- col_name - } - - ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + - viridis::scale_fill_viridis( - name = paste0("% cohérence\n(", indicator_label, ")"), - option = "magma", - direction = -1, - limits = c(0, 100), - na.value = "grey90" - ) + - ggplot2::facet_wrap(~ YEAR, drop = TRUE) + - ggplot2::labs( - title = "Cohérence des données par niveau administratif 2 et par année", - subtitle = paste("Indicateur :", indicator_label), - caption = "Source : DHIS2 données routinières" - ) + - ggplot2::theme_minimal(base_size = 15) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - strip.text = ggplot2::element_text(size = 14, face = "bold"), - plot.title = ggplot2::element_text(size = 20, face = "bold"), - legend.position = "right" - ) + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) +} + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) } From 776887dab5ea567c374461cb280064b80d83045c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 13:19:15 +0100 Subject: [PATCH 07/17] fix for outliers pipelines --- ...s2_outliers_imputation_magic_glasses.ipynb | 4 +- ...iers_imputation_magic_glasses_report.ipynb | 2 +- ..._dhis2_outliers_imputation_magic_glasses.r | 6 + ...outliers_imputation_magic_glasses_report.r | 5 + .../snt_dhis2_outliers_imputation_mean.ipynb | 9 +- ...his2_outliers_imputation_mean_report.ipynb | 168 +++--------------- .../snt_dhis2_outliers_imputation_mean.r | 6 + ...nt_dhis2_outliers_imputation_mean_report.r | 150 ++++++++++++++++ ...snt_dhis2_outliers_imputation_median.ipynb | 9 +- ...s2_outliers_imputation_median_report.ipynb | 168 +++--------------- .../snt_dhis2_outliers_imputation_median.r | 6 + ..._dhis2_outliers_imputation_median_report.r | 150 ++++++++++++++++ 12 files changed, 377 insertions(+), 306 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 78a0083..211d395 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -64,7 +64,7 @@ "outputs": [], "source": [ "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses.r\"))\n", "\n", "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", @@ -84,8 +84,6 @@ "DATA_PATH <- setup_ctx$DATA_PATH\n", "openhexa <- setup_ctx$openhexa\n", "\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"magic_glasses_utils.R\"))\n", - "\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", "}\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 3ceddcb..2340133 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -30,7 +30,7 @@ "ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses_report.r\"))\n", "setup_ctx <- bootstrap_magic_glasses_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\")\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r new file mode 100644 index 0000000..af6ad57 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -0,0 +1,6 @@ +# Main helpers for magic glasses outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "magic_glasses_utils.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r new file mode 100644 index 0000000..af9912d --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -0,0 +1,5 @@ +# Report helpers for magic glasses outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 62c5d49..cdefca5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -61,8 +61,8 @@ "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_mean.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", @@ -71,10 +71,7 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Pipeline-specific helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index ce58bf5..ca1ba0d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -37,8 +37,8 @@ "SNT_ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_mean_report.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", @@ -46,10 +46,7 @@ "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Reporting helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -480,24 +477,10 @@ }, "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_mean_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -511,83 +494,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -610,52 +524,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r new file mode 100644 index 0000000..df776cd --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -0,0 +1,6 @@ +# Main helpers for mean outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "imputation_utils.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r new file mode 100644 index 0000000..7ff945a --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -0,0 +1,150 @@ +# Report helpers for mean outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "reporting_utils.R")) + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) +} + diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index 4c81480..a16e999 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -61,8 +61,8 @@ "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_median.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", @@ -71,10 +71,7 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Pipeline-specific helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 04b02b6..854ca4e 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -37,8 +37,8 @@ "SNT_ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_median_report.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", @@ -46,10 +46,7 @@ "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Reporting helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -480,24 +477,10 @@ }, "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_median_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -511,83 +494,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -610,52 +524,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r new file mode 100644 index 0000000..b5052dd --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -0,0 +1,6 @@ +# Main helpers for median outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "imputation_utils.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r new file mode 100644 index 0000000..393a589 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -0,0 +1,150 @@ +# Report helpers for median outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "reporting_utils.R")) + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) +} + From b882af154e635e64c52aa6b91e5b19cfa7f73673 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 13:25:16 +0100 Subject: [PATCH 08/17] fix names and deleted old files --- .../utils/bootstrap.R | 28 -- .../utils/magic_glasses_utils.R | 97 ------ ..._dhis2_outliers_imputation_magic_glasses.r | 129 +++++++- ...outliers_imputation_magic_glasses_report.r | 2 +- .../utils/bootstrap.R | 45 --- .../utils/imputation_utils.R | 39 --- .../utils/reporting_utils.R | 124 -------- .../snt_dhis2_outliers_imputation_mean.r | 88 +++++- ...nt_dhis2_outliers_imputation_mean_report.r | 294 ++++++++++-------- .../utils/bootstrap.R | 45 --- .../utils/imputation_utils.R | 39 --- .../utils/reporting_utils.R | 124 -------- .../snt_dhis2_outliers_imputation_median.r | 88 +++++- ..._dhis2_outliers_imputation_median_report.r | 294 ++++++++++-------- 14 files changed, 618 insertions(+), 818 deletions(-) delete mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R deleted file mode 100644 index 3e1d63d..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R +++ /dev/null @@ -1,28 +0,0 @@ -# Shared bootstrap for Magic Glasses notebooks. -bootstrap_magic_glasses_context <- function( - root_path = "~/workspace", - required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(unique(required_packages)) - - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R deleted file mode 100644 index ce57811..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R +++ /dev/null @@ -1,97 +0,0 @@ -detect_outliers_mad_custom <- function(dt, deviation) { - flag_col <- paste0("OUTLIER_MAD", deviation) - dt <- data.table::copy(dt) - dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] - dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] - dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))] - dt[is.na(get(flag_col)), (flag_col) := FALSE] - dt[, c("median_val", "mad_val") := NULL] - dt -} - -detect_seasonal_outliers <- function(dt, deviation, workers = 1) { - outlier_col <- paste0("OUTLIER_SEASONAL", deviation) - dt <- data.table::copy(dt) - data.table::setorder(dt, OU_ID, INDICATOR, PERIOD) - - process_group <- function(sub_dt) { - n_valid <- sum(!is.na(sub_dt$VALUE)) - if (n_valid < 2) { - return(data.table::data.table( - PERIOD = sub_dt$PERIOD, - OU_ID = sub_dt$OU_ID, - INDICATOR = sub_dt$INDICATOR, - OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) - )) - } - - values <- as.numeric(sub_dt$VALUE) - ts_data <- stats::ts(values, frequency = 12) - cleaned_ts <- tryCatch( - forecast::tsclean(ts_data, replace.missing = TRUE), - error = function(e) ts_data - ) - mad_val <- mad(values, constant = 1, na.rm = TRUE) - - if (is.na(mad_val) || mad_val == 0) { - return(data.table::data.table( - PERIOD = sub_dt$PERIOD, - OU_ID = sub_dt$OU_ID, - INDICATOR = sub_dt$INDICATOR, - OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) - )) - } - - is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation - is_outlier[is.na(is_outlier)] <- FALSE - - data.table::data.table( - PERIOD = sub_dt$PERIOD, - OU_ID = sub_dt$OU_ID, - INDICATOR = sub_dt$INDICATOR, - OUTLIER_FLAG = as.logical(is_outlier) - ) - } - - group_keys <- unique(dt[, .(OU_ID, INDICATOR)]) - group_list <- lapply(seq_len(nrow(group_keys)), function(i) { - dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]] - }) - - if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) { - result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE) - } else { - result_list <- lapply(group_list, process_group) - } - - outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE) - data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col) - - result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE) - result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE] - result_dt -} - -to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { - routine_wide <- data.table::dcast( - dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], - PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR, - value.var = "VALUE" - ) - - routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) - - target_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", indicators_to_keep) - for (col in setdiff(target_cols, names(routine_wide))) { - if (col %in% indicators_to_keep) { - routine_wide[, (col) := NA_real_] - } else if (col %in% c("YEAR", "MONTH")) { - routine_wide[, (col) := NA_integer_] - } else { - routine_wide[, (col) := NA_character_] - } - } - cols_to_keep <- intersect(target_cols, names(routine_wide)) - routine_wide <- routine_wide[, ..cols_to_keep] - routine_wide -} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index af6ad57..28ec619 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -1,6 +1,127 @@ # Main helpers for magic glasses outliers imputation pipeline. -.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "magic_glasses_utils.R")) +bootstrap_magic_glasses_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(unique(required_packages)) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa + )) +} + +detect_outliers_mad_custom <- function(dt, deviation) { + flag_col <- paste0("OUTLIER_MAD", deviation) + dt <- data.table::copy(dt) + dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))] + dt[is.na(get(flag_col)), (flag_col) := FALSE] + dt[, c("median_val", "mad_val") := NULL] + dt +} + +detect_seasonal_outliers <- function(dt, deviation, workers = 1) { + outlier_col <- paste0("OUTLIER_SEASONAL", deviation) + dt <- data.table::copy(dt) + data.table::setorder(dt, OU_ID, INDICATOR, PERIOD) + + process_group <- function(sub_dt) { + n_valid <- sum(!is.na(sub_dt$VALUE)) + if (n_valid < 2) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + values <- as.numeric(sub_dt$VALUE) + ts_data <- stats::ts(values, frequency = 12) + cleaned_ts <- tryCatch( + forecast::tsclean(ts_data, replace.missing = TRUE), + error = function(e) ts_data + ) + mad_val <- mad(values, constant = 1, na.rm = TRUE) + + if (is.na(mad_val) || mad_val == 0) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation + is_outlier[is.na(is_outlier)] <- FALSE + + data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = as.logical(is_outlier) + ) + } + + group_keys <- unique(dt[, .(OU_ID, INDICATOR)]) + group_list <- lapply(seq_len(nrow(group_keys)), function(i) { + dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]] + }) + + if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) { + result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE) + } else { + result_list <- lapply(group_list, process_group) + } + + outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE) + data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col) + + result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE) + result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE] + result_dt +} + +to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { + routine_wide <- data.table::dcast( + dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], + PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR, + value.var = "VALUE" + ) + + routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) + + target_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", indicators_to_keep) + for (col in setdiff(target_cols, names(routine_wide))) { + if (col %in% indicators_to_keep) { + routine_wide[, (col) := NA_real_] + } else if (col %in% c("YEAR", "MONTH")) { + routine_wide[, (col) := NA_integer_] + } else { + routine_wide[, (col) := NA_character_] + } + } + cols_to_keep <- intersect(target_cols, names(routine_wide)) + routine_wide <- routine_wide[, ..cols_to_keep] + routine_wide +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r index af9912d..8c531c5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -1,5 +1,5 @@ # Report helpers for magic glasses outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) .this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_magic_glasses.r")) diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R deleted file mode 100644 index 8642d85..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R +++ /dev/null @@ -1,45 +0,0 @@ -# Shared bootstrap for outliers notebooks. -bootstrap_outliers_context <- function( - root_path = "~/workspace", - required_packages = c( - "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", - "reticulate", "glue", "zoo" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R deleted file mode 100644 index 72f70e7..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R +++ /dev/null @@ -1,39 +0,0 @@ -impute_outliers_dt <- function(dt, outlier_col) { - dt <- data.table::as.data.table(dt) - data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) - dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] - dt[, MOVING_AVG := data.table::frollapply( - TO_IMPUTE, - n = 3, - FUN = function(x) ceiling(mean(x, na.rm = TRUE)), - align = "center" - ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] - dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] - dt[, c("TO_IMPUTE") := NULL] - return(as.data.frame(data.table::copy(dt))) -} - -format_routine_data_selection <- function( - df, - outlier_column, - DHIS2_INDICATORS, - fixed_cols, - pyramid_names, - remove = FALSE -) { - if (remove) { - df <- df %>% dplyr::filter(!.data[[outlier_column]]) - } - target_cols <- c( - "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", - "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS - ) - output <- df %>% - dplyr::select(-VALUE) %>% - dplyr::rename(VALUE = VALUE_IMPUTED) %>% - dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% - dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% - tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% - dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) - return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R deleted file mode 100644 index 719f4f6..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R +++ /dev/null @@ -1,124 +0,0 @@ -printdim <- function(df, name = deparse(substitute(df))) { - cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") -} - -plot_outliers <- function(ind_name, df, outlier_col) { - df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) - df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) - ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + - ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - ggplot2::aes(x = YEAR, y = VALUE), - color = "red", - size = 2.8, - alpha = 0.85, - na.rm = TRUE - ) + - ggplot2::labs( - title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), - subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", - x = "Année", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 14) -} - -plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { - df_ind <- df %>% - dplyr::filter( - INDICATOR == ind_name, - !is.na(YEAR), - !is.na(VALUE), - is.finite(VALUE) - ) - if (nrow(df_ind) == 0) { - return(NULL) - } - ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + - ggplot2::geom_point(color = "grey60", alpha = 0.3) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - color = "red", - size = 2.8, - alpha = 0.85 - ) + - ggplot2::facet_wrap(~ YEAR, scales = "free_y") + - ggplot2::labs( - title = paste("Détection des valeurs aberrantes —", ind_name), - subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), - x = "District (ADM2)", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 13) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) -} - -plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!agg_level %in% names(df)) { - stop(paste0("Aggregation level '", agg_level, "' not found in data!")) - } - df_year <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% - dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% - dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% - dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) - n_units <- dplyr::n_distinct(df_year[[agg_level]]) - plot_height <- max(6, 0.5 * n_units) - agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level - p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + - viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + - ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.y = ggplot2::element_text(size = 12), - axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), - plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), - legend.title = ggplot2::element_text(size = 12), - legend.text = ggplot2::element_text(size = 10) - ) - options(repr.plot.width = 14, repr.plot.height = plot_height) - if (!is.null(filename)) { - ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) - } - if (do_plot) { - print(p) - } -} - -plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!col_name %in% names(map_data)) { - stop(paste0("Column '", col_name, "' not found in the data!")) - } - if (is.null(indicator_label)) { - indicator_label <- col_name - } - ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + - viridis::scale_fill_viridis( - name = paste0("% cohérence\n(", indicator_label, ")"), - option = "magma", - direction = -1, - limits = c(0, 100), - na.value = "grey90" - ) + - ggplot2::facet_wrap(~ YEAR, drop = TRUE) + - ggplot2::labs( - title = "Cohérence des données par niveau administratif 2 et par année", - subtitle = paste("Indicateur :", indicator_label), - caption = "Source : DHIS2 données routinières" - ) + - ggplot2::theme_minimal(base_size = 15) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - strip.text = ggplot2::element_text(size = 14, face = "bold"), - plot.title = ggplot2::element_text(size = 20, face = "bold"), - legend.position = "right" - ) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r index df776cd..474f342 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -1,6 +1,86 @@ # Main helpers for mean outliers imputation pipeline. -.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "imputation_utils.R")) +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r index 7ff945a..e9dd658 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -1,150 +1,180 @@ # Report helpers for mean outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) .this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "reporting_utils.R")) +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_mean.r")) -get_coherence_definitions <- function() { - checks <- list( - allout_susp = c("ALLOUT", "SUSP"), - allout_test = c("ALLOUT", "TEST"), - susp_test = c("SUSP", "TEST"), - test_conf = c("TEST", "CONF"), - conf_treat = c("CONF", "MALTREAT"), - adm_dth = c("MALADM", "MALDTH") - ) +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} - check_labels <- c( - pct_coherent_allout_susp = "Ambulatoire >= Suspects", - pct_coherent_allout_test = "Ambulatoire >= Testes", - pct_coherent_susp_test = "Suspects >= Testes", - pct_coherent_test_conf = "Testes >= Confirmes", - pct_coherent_conf_treat = "Confirmes >= Traites", - pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" - ) +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Outliers for indicator:", ind_name), + subtitle = "Grey = all values, red = detected outliers", + x = "Year", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 14) +} - list(checks = checks, check_labels = check_labels) +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Outliers by district and year:", ind_name), + x = "District", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 12) } -compute_national_coherency_metrics <- function(df, checks, check_labels) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") + if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { + stop("Data frame is missing required columns.") + } + comp <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% + dplyr::summarise( + coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), + n = dplyr::n(), + .groups = "drop" + ) + p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + ggplot2::labs( + title = paste("Coherence heatmap -", agg_level, "-", selected_year), + x = agg_level, + y = "Indicator", + fill = "Coherence" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) + if (isTRUE(do_plot)) print(p) + invisible(p) +} - check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") + if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) + ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, "-", indicator_label)) + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + + ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + + ggplot2::labs(title = ttl, fill = col_name) + + ggplot2::theme_minimal(base_size = 12) +} - df_checks %>% - dplyr::group_by(.data$YEAR) %>% - dplyr::summarise( - dplyr::across( - dplyr::all_of(check_cols), - ~ mean(.x, na.rm = TRUE) * 100, - .names = "pct_{.col}" - ), - .groups = "drop" - ) %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_"), - names_to = "check_type", - names_prefix = "pct_check_", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate( - check_label = dplyr::recode( - .data$check_type, - !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) - ), - check_label = factor(.data$check_label, levels = unique(.data$check_label)), - check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) +get_coherence_definitions <- function() { + checks <- list( + "long_term" = function(x) (x >= 0.95), + "short_term" = function(x) (x >= 0.95), + "cyclicality" = function(x) (x >= 0.90), + "volatility" = function(x) (x >= 0.90), + "rolling_sd" = function(x) (x <= 0.80), + "spatial" = function(x) (x <= 0.80), + "residual" = function(x) (x <= 2), + "trend_strength" = function(x) (x >= 0.20) ) + check_labels <- c( + "long_term" = "Long-term (>= 95%)", + "short_term" = "Short-term (>= 95%)", + "cyclicality" = "Cyclicality (>= 90%)", + "volatility" = "Volatility (>= 90%)", + "rolling_sd" = "Rolling SD (<= 80%)", + "spatial" = "Spatial (<= 80%)", + "residual" = "Residual (<= 2)", + "trend_strength" = "Trend strength (>= 20%)" + ) + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { + vals <- df[[check_name]] + tibble::tibble( + check = check_name, + label = check_labels[[check_name]], + percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + ) + }) + coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) + coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - ggplot2::ggplot(coherency_metrics, ggplot2::aes( - x = factor(.data$YEAR), - y = .data$check_label, - fill = .data$pct_coherent - )) + - ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + - ggplot2::geom_text( - ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), - color = "white", - fontface = "bold", - size = 5 - ) + - viridis::scale_fill_viridis( - name = "% Coherent", - option = "viridis", - limits = c(0, 100), - direction = -1 - ) + - ggplot2::labs( - title = "Controles de coherence des donnees (niveau national)", - x = "Annee", - y = NULL - ) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), - axis.text.y = ggplot2::element_text(size = 16, hjust = 0), - axis.text.x = ggplot2::element_text(size = 16), - legend.title = ggplot2::element_text(size = 16, face = "bold"), - legend.text = ggplot2::element_text(size = 14), - legend.key.width = grid::unit(0.7, "cm"), - legend.key.height = grid::unit(1.2, "cm") - ) + ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + + ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + + ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + + ggplot2::scale_fill_gradient2( + low = "#f7fcf5", mid = "#74c476", high = "#00441b", + midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ) + + ggplot2::scale_x_continuous(expand = c(0, 0)) + + ggplot2::labs( + title = "National coherence overview", + subtitle = "Percentage of indicators meeting each coherence criterion", + x = NULL, y = NULL + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme( + axis.text.x = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + panel.grid = ggplot2::element_blank(), + legend.position = "right", + plot.title = ggplot2::element_text(face = "bold"), + plot.subtitle = ggplot2::element_text(color = "gray30"), + axis.text.y = ggplot2::element_text(face = "bold") + ) } compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA_real_) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) - - check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] - valid_checks <- check_cols[ - purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) - ] - - adm_coherence <- df_checks %>% - dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% - dplyr::summarise( - total_reports = dplyr::n(), - !!!purrr::map( - valid_checks, - ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) - ) %>% - stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), - .groups = "drop" - ) %>% - dplyr::filter(.data$total_reports >= min_reports) - - adm_long <- adm_coherence %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_coherent_"), - names_to = "check_type", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) - - list(adm_coherence = adm_coherence, adm_long = adm_long) + ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") + adm_long <- lapply(ADM_levels, function(level) { + df %>% + dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% + dplyr::group_by(.data[[level]], INDICATOR) %>% + dplyr::summarise( + dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), + n_reports = dplyr::n(), + .groups = "drop" + ) %>% + dplyr::filter(n_reports >= min_reports) %>% + tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% + dplyr::mutate(level = level, label = check_labels[check]) + }) %>% dplyr::bind_rows() + adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) + adm_long } - diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R deleted file mode 100644 index 8642d85..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R +++ /dev/null @@ -1,45 +0,0 @@ -# Shared bootstrap for outliers notebooks. -bootstrap_outliers_context <- function( - root_path = "~/workspace", - required_packages = c( - "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", - "reticulate", "glue", "zoo" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R deleted file mode 100644 index 72f70e7..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R +++ /dev/null @@ -1,39 +0,0 @@ -impute_outliers_dt <- function(dt, outlier_col) { - dt <- data.table::as.data.table(dt) - data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) - dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] - dt[, MOVING_AVG := data.table::frollapply( - TO_IMPUTE, - n = 3, - FUN = function(x) ceiling(mean(x, na.rm = TRUE)), - align = "center" - ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] - dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] - dt[, c("TO_IMPUTE") := NULL] - return(as.data.frame(data.table::copy(dt))) -} - -format_routine_data_selection <- function( - df, - outlier_column, - DHIS2_INDICATORS, - fixed_cols, - pyramid_names, - remove = FALSE -) { - if (remove) { - df <- df %>% dplyr::filter(!.data[[outlier_column]]) - } - target_cols <- c( - "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", - "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS - ) - output <- df %>% - dplyr::select(-VALUE) %>% - dplyr::rename(VALUE = VALUE_IMPUTED) %>% - dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% - dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% - tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% - dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) - return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R deleted file mode 100644 index 719f4f6..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R +++ /dev/null @@ -1,124 +0,0 @@ -printdim <- function(df, name = deparse(substitute(df))) { - cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") -} - -plot_outliers <- function(ind_name, df, outlier_col) { - df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) - df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) - ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + - ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - ggplot2::aes(x = YEAR, y = VALUE), - color = "red", - size = 2.8, - alpha = 0.85, - na.rm = TRUE - ) + - ggplot2::labs( - title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), - subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", - x = "Année", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 14) -} - -plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { - df_ind <- df %>% - dplyr::filter( - INDICATOR == ind_name, - !is.na(YEAR), - !is.na(VALUE), - is.finite(VALUE) - ) - if (nrow(df_ind) == 0) { - return(NULL) - } - ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + - ggplot2::geom_point(color = "grey60", alpha = 0.3) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - color = "red", - size = 2.8, - alpha = 0.85 - ) + - ggplot2::facet_wrap(~ YEAR, scales = "free_y") + - ggplot2::labs( - title = paste("Détection des valeurs aberrantes —", ind_name), - subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), - x = "District (ADM2)", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 13) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) -} - -plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!agg_level %in% names(df)) { - stop(paste0("Aggregation level '", agg_level, "' not found in data!")) - } - df_year <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% - dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% - dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% - dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) - n_units <- dplyr::n_distinct(df_year[[agg_level]]) - plot_height <- max(6, 0.5 * n_units) - agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level - p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + - viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + - ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.y = ggplot2::element_text(size = 12), - axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), - plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), - legend.title = ggplot2::element_text(size = 12), - legend.text = ggplot2::element_text(size = 10) - ) - options(repr.plot.width = 14, repr.plot.height = plot_height) - if (!is.null(filename)) { - ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) - } - if (do_plot) { - print(p) - } -} - -plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!col_name %in% names(map_data)) { - stop(paste0("Column '", col_name, "' not found in the data!")) - } - if (is.null(indicator_label)) { - indicator_label <- col_name - } - ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + - viridis::scale_fill_viridis( - name = paste0("% cohérence\n(", indicator_label, ")"), - option = "magma", - direction = -1, - limits = c(0, 100), - na.value = "grey90" - ) + - ggplot2::facet_wrap(~ YEAR, drop = TRUE) + - ggplot2::labs( - title = "Cohérence des données par niveau administratif 2 et par année", - subtitle = paste("Indicateur :", indicator_label), - caption = "Source : DHIS2 données routinières" - ) + - ggplot2::theme_minimal(base_size = 15) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - strip.text = ggplot2::element_text(size = 14, face = "bold"), - plot.title = ggplot2::element_text(size = 20, face = "bold"), - legend.position = "right" - ) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r index b5052dd..3b46c55 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -1,6 +1,86 @@ # Main helpers for median outliers imputation pipeline. -.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "imputation_utils.R")) +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MEDIAN_IMPUTED := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(median(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MEDIAN_IMPUTED, TO_IMPUTE)] + dt[, c("TO_IMPUTE", "MEDIAN_IMPUTED") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r index 393a589..52096f9 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -1,150 +1,180 @@ # Report helpers for median outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) .this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "reporting_utils.R")) +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_median.r")) -get_coherence_definitions <- function() { - checks <- list( - allout_susp = c("ALLOUT", "SUSP"), - allout_test = c("ALLOUT", "TEST"), - susp_test = c("SUSP", "TEST"), - test_conf = c("TEST", "CONF"), - conf_treat = c("CONF", "MALTREAT"), - adm_dth = c("MALADM", "MALDTH") - ) +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} - check_labels <- c( - pct_coherent_allout_susp = "Ambulatoire >= Suspects", - pct_coherent_allout_test = "Ambulatoire >= Testes", - pct_coherent_susp_test = "Suspects >= Testes", - pct_coherent_test_conf = "Testes >= Confirmes", - pct_coherent_conf_treat = "Confirmes >= Traites", - pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" - ) +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Outliers for indicator:", ind_name), + subtitle = "Grey = all values, red = detected outliers", + x = "Year", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 14) +} - list(checks = checks, check_labels = check_labels) +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Outliers by district and year:", ind_name), + x = "District", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 12) } -compute_national_coherency_metrics <- function(df, checks, check_labels) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") + if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { + stop("Data frame is missing required columns.") + } + comp <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% + dplyr::summarise( + coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), + n = dplyr::n(), + .groups = "drop" + ) + p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + ggplot2::labs( + title = paste("Coherence heatmap -", agg_level, "-", selected_year), + x = agg_level, + y = "Indicator", + fill = "Coherence" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) + if (isTRUE(do_plot)) print(p) + invisible(p) +} - check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") + if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) + ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, "-", indicator_label)) + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + + ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + + ggplot2::labs(title = ttl, fill = col_name) + + ggplot2::theme_minimal(base_size = 12) +} - df_checks %>% - dplyr::group_by(.data$YEAR) %>% - dplyr::summarise( - dplyr::across( - dplyr::all_of(check_cols), - ~ mean(.x, na.rm = TRUE) * 100, - .names = "pct_{.col}" - ), - .groups = "drop" - ) %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_"), - names_to = "check_type", - names_prefix = "pct_check_", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate( - check_label = dplyr::recode( - .data$check_type, - !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) - ), - check_label = factor(.data$check_label, levels = unique(.data$check_label)), - check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) +get_coherence_definitions <- function() { + checks <- list( + "long_term" = function(x) (x >= 0.95), + "short_term" = function(x) (x >= 0.95), + "cyclicality" = function(x) (x >= 0.90), + "volatility" = function(x) (x >= 0.90), + "rolling_sd" = function(x) (x <= 0.80), + "spatial" = function(x) (x <= 0.80), + "residual" = function(x) (x <= 2), + "trend_strength" = function(x) (x >= 0.20) ) + check_labels <- c( + "long_term" = "Long-term (>= 95%)", + "short_term" = "Short-term (>= 95%)", + "cyclicality" = "Cyclicality (>= 90%)", + "volatility" = "Volatility (>= 90%)", + "rolling_sd" = "Rolling SD (<= 80%)", + "spatial" = "Spatial (<= 80%)", + "residual" = "Residual (<= 2)", + "trend_strength" = "Trend strength (>= 20%)" + ) + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { + vals <- df[[check_name]] + tibble::tibble( + check = check_name, + label = check_labels[[check_name]], + percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + ) + }) + coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) + coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - ggplot2::ggplot(coherency_metrics, ggplot2::aes( - x = factor(.data$YEAR), - y = .data$check_label, - fill = .data$pct_coherent - )) + - ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + - ggplot2::geom_text( - ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), - color = "white", - fontface = "bold", - size = 5 - ) + - viridis::scale_fill_viridis( - name = "% Coherent", - option = "viridis", - limits = c(0, 100), - direction = -1 - ) + - ggplot2::labs( - title = "Controles de coherence des donnees (niveau national)", - x = "Annee", - y = NULL - ) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), - axis.text.y = ggplot2::element_text(size = 16, hjust = 0), - axis.text.x = ggplot2::element_text(size = 16), - legend.title = ggplot2::element_text(size = 16, face = "bold"), - legend.text = ggplot2::element_text(size = 14), - legend.key.width = grid::unit(0.7, "cm"), - legend.key.height = grid::unit(1.2, "cm") - ) + ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + + ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + + ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + + ggplot2::scale_fill_gradient2( + low = "#f7fcf5", mid = "#74c476", high = "#00441b", + midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ) + + ggplot2::scale_x_continuous(expand = c(0, 0)) + + ggplot2::labs( + title = "National coherence overview", + subtitle = "Percentage of indicators meeting each coherence criterion", + x = NULL, y = NULL + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme( + axis.text.x = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + panel.grid = ggplot2::element_blank(), + legend.position = "right", + plot.title = ggplot2::element_text(face = "bold"), + plot.subtitle = ggplot2::element_text(color = "gray30"), + axis.text.y = ggplot2::element_text(face = "bold") + ) } compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA_real_) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) - - check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] - valid_checks <- check_cols[ - purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) - ] - - adm_coherence <- df_checks %>% - dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% - dplyr::summarise( - total_reports = dplyr::n(), - !!!purrr::map( - valid_checks, - ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) - ) %>% - stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), - .groups = "drop" - ) %>% - dplyr::filter(.data$total_reports >= min_reports) - - adm_long <- adm_coherence %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_coherent_"), - names_to = "check_type", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) - - list(adm_coherence = adm_coherence, adm_long = adm_long) + ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") + adm_long <- lapply(ADM_levels, function(level) { + df %>% + dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% + dplyr::group_by(.data[[level]], INDICATOR) %>% + dplyr::summarise( + dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), + n_reports = dplyr::n(), + .groups = "drop" + ) %>% + dplyr::filter(n_reports >= min_reports) %>% + tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% + dplyr::mutate(level = level, label = check_labels[check]) + }) %>% dplyr::bind_rows() + adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) + adm_long } - From f14ac32fc27317f7d2c8da61e1048005c1545ade Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 14:45:27 +0100 Subject: [PATCH 09/17] Update helper file paths in outlier imputation and reporting notebooks to reflect new naming conventions. This change enhances clarity and consistency across the pipeline. --- .../snt_dhis2_outliers_imputation_magic_glasses.ipynb | 2 +- .../code/snt_dhis2_outliers_imputation_mean.ipynb | 4 ++-- .../snt_dhis2_outliers_imputation_mean_report.ipynb | 8 ++++---- .../code/snt_dhis2_outliers_imputation_median.ipynb | 4 ++-- .../snt_dhis2_outliers_imputation_median_report.ipynb | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 211d395..cfdccf2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -141,7 +141,7 @@ }, "outputs": [], "source": [ - "# Helpers loaded from utils/magic_glasses_utils.R\n", + "# Helpers loaded from utils/snt_dhis2_outliers_imputation_magic_glasses.r\n", "# - detect_outliers_mad_custom()\n", "# - detect_seasonal_outliers()" ] diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index cdefca5..9413267 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -512,7 +512,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean.r\n", "start_time <- Sys.time()" ] }, @@ -590,7 +590,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index ca1ba0d..c650d21 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -85,7 +85,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/reporting_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { @@ -219,7 +219,7 @@ }, "outputs": [], "source": [ - "# Plot helpers loaded from utils/reporting_utils.R\n", + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_mean_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" ] @@ -549,7 +549,7 @@ }, "outputs": [], "source": [ - "# Coherence heatmap helper loaded from utils/reporting_utils.R" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { @@ -616,7 +616,7 @@ }, "outputs": [], "source": [ - "# Coherence map helper loaded from utils/reporting_utils.R" + "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index a16e999..b6cd2c7 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -522,7 +522,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median.r\n", "start_time <- Sys.time()" ] }, @@ -600,7 +600,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 854ca4e..685f278 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -85,7 +85,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/reporting_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { @@ -219,7 +219,7 @@ }, "outputs": [], "source": [ - "# Plot helpers loaded from utils/reporting_utils.R\n", + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_median_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" ] @@ -549,7 +549,7 @@ }, "outputs": [], "source": [ - "# Coherence heatmap helper loaded from utils/reporting_utils.R" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { @@ -616,7 +616,7 @@ }, "outputs": [], "source": [ - "# Coherence map helper loaded from utils/reporting_utils.R" + "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { From 1dbb87fa98d15dbcf31406f826dad87db5481d53 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 15:18:36 +0100 Subject: [PATCH 10/17] fix --- .../snt_dhis2_outliers_imputation_magic_glasses_report.r | 8 +++++++- .../utils/snt_dhis2_outliers_imputation_mean_report.r | 8 +++++++- .../utils/snt_dhis2_outliers_imputation_median_report.r | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r index 8c531c5..b754d07 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -1,5 +1,11 @@ # Report helpers for magic glasses outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} source(file.path(.this_dir, "snt_dhis2_outliers_imputation_magic_glasses.r")) diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r index e9dd658..8aca814 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -1,6 +1,12 @@ # Report helpers for mean outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} source(file.path(.this_dir, "snt_dhis2_outliers_imputation_mean.r")) printdim <- function(df, name = deparse(substitute(df))) { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r index 52096f9..9815e39 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -1,6 +1,12 @@ # Report helpers for median outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} source(file.path(.this_dir, "snt_dhis2_outliers_imputation_median.r")) printdim <- function(df, name = deparse(substitute(df))) { From b5f9e3d02360a7a1202d3661300ec09e1efaf921 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 17:24:12 +0100 Subject: [PATCH 11/17] still bugged --- ...outliers_imputation_magic_glasses_report.r | 27 +- ...nt_dhis2_outliers_imputation_mean_report.r | 310 ++++++++++++------ ..._dhis2_outliers_imputation_median_report.r | 310 ++++++++++++------ 3 files changed, 456 insertions(+), 191 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r index b754d07..c3a9df6 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -1,11 +1,24 @@ # Report helpers for magic glasses outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { - file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") -} else if (!is.na(.this_file)) { - dirname(.this_file) -} else { - getwd() +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_magic_glasses.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_magic_glasses.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_magic_glasses.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_magic_glasses.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) } -source(file.path(.this_dir, "snt_dhis2_outliers_imputation_magic_glasses.r")) +source(.target_file) diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r index 8aca814..ce792ef 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -1,13 +1,28 @@ # Report helpers for mean outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { - file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") -} else if (!is.na(.this_file)) { - dirname(.this_file) -} else { - getwd() +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_mean.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_mean.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_mean.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_mean.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) } -source(file.path(.this_dir, "snt_dhis2_outliers_imputation_mean.r")) +source(.target_file) + +`%||%` <- function(x, y) if (!is.null(x)) x else y printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") @@ -64,123 +79,234 @@ plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { } plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") - if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { - stop("Data frame is missing required columns.") - } - comp <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% - dplyr::summarise( - coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), - n = dplyr::n(), - .groups = "drop" + if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) ) - p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + ggplot2::labs( - title = paste("Coherence heatmap -", agg_level, "-", selected_year), - x = agg_level, - y = "Indicator", - fill = "Coherence" + title = sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL ) + ggplot2::theme_minimal(base_size = 12) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) - if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) - if (isTRUE(do_plot)) print(p) + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) invisible(p) } plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") - if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) - ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, "-", indicator_label)) + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + - ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + - ggplot2::labs(title = ttl, fill = col_name) + - ggplot2::theme_minimal(base_size = 12) + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) } get_coherence_definitions <- function() { checks <- list( - "long_term" = function(x) (x >= 0.95), - "short_term" = function(x) (x >= 0.95), - "cyclicality" = function(x) (x >= 0.90), - "volatility" = function(x) (x >= 0.90), - "rolling_sd" = function(x) (x <= 0.80), - "spatial" = function(x) (x <= 0.80), - "residual" = function(x) (x <= 2), - "trend_strength" = function(x) (x >= 0.20) + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") ) + check_labels <- c( - "long_term" = "Long-term (>= 95%)", - "short_term" = "Short-term (>= 95%)", - "cyclicality" = "Cyclicality (>= 90%)", - "volatility" = "Volatility (>= 90%)", - "rolling_sd" = "Rolling SD (<= 80%)", - "spatial" = "Spatial (<= 80%)", - "residual" = "Residual (<= 2)", - "trend_strength" = "Trend strength (>= 20%)" + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" ) + list(checks = checks, check_labels = check_labels) } compute_national_coherency_metrics <- function(df, checks, check_labels) { - coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { - vals <- df[[check_name]] - tibble::tibble( - check = check_name, - label = check_labels[[check_name]], - percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + if (length(check_cols) == 0) { + return(tibble::tibble( + YEAR = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = factor() + )) + } + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) ) - }) - coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) - coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + - ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + - ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + - ggplot2::scale_fill_gradient2( - low = "#f7fcf5", mid = "#74c476", high = "#00441b", - midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 ) + - ggplot2::scale_x_continuous(expand = c(0, 0)) + ggplot2::labs( - title = "National coherence overview", - subtitle = "Percentage of indicators meeting each coherence criterion", - x = NULL, y = NULL + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL ) + - ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme_minimal(base_size = 14) + ggplot2::theme( - axis.text.x = ggplot2::element_blank(), - axis.ticks = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), - legend.position = "right", - plot.title = ggplot2::element_text(face = "bold"), - plot.subtitle = ggplot2::element_text(color = "gray30"), - axis.text.y = ggplot2::element_text(face = "bold") + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") ) } compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") - adm_long <- lapply(ADM_levels, function(level) { - df %>% - dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% - dplyr::group_by(.data[[level]], INDICATOR) %>% - dplyr::summarise( - dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), - n_reports = dplyr::n(), - .groups = "drop" + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + if (length(valid_checks) == 0) { + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise(total_reports = dplyr::n(), .groups = "drop") %>% + dplyr::filter(.data$total_reports >= min_reports) + adm_long <- tibble::tibble( + ADM1_NAME = character(), + ADM2_NAME = character(), + ADM2_ID = character(), + YEAR = integer(), + total_reports = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = character() + ) + return(list(adm_coherence = adm_coherence, adm_long = adm_long)) + } + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) ) %>% - dplyr::filter(n_reports >= min_reports) %>% - tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% - dplyr::mutate(level = level, label = check_labels[check]) - }) %>% dplyr::bind_rows() - adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) - adm_long + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) } diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r index 9815e39..ea3d558 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -1,13 +1,28 @@ # Report helpers for median outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { - file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") -} else if (!is.na(.this_file)) { - dirname(.this_file) -} else { - getwd() +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_median.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_median.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_median.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_median.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) } -source(file.path(.this_dir, "snt_dhis2_outliers_imputation_median.r")) +source(.target_file) + +`%||%` <- function(x, y) if (!is.null(x)) x else y printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") @@ -64,123 +79,234 @@ plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { } plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") - if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { - stop("Data frame is missing required columns.") - } - comp <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% - dplyr::summarise( - coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), - n = dplyr::n(), - .groups = "drop" + if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) ) - p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + ggplot2::labs( - title = paste("Coherence heatmap -", agg_level, "-", selected_year), - x = agg_level, - y = "Indicator", - fill = "Coherence" + title = sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL ) + ggplot2::theme_minimal(base_size = 12) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) - if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) - if (isTRUE(do_plot)) print(p) + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) invisible(p) } plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") - if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) - ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, "-", indicator_label)) + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + - ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + - ggplot2::labs(title = ttl, fill = col_name) + - ggplot2::theme_minimal(base_size = 12) + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) } get_coherence_definitions <- function() { checks <- list( - "long_term" = function(x) (x >= 0.95), - "short_term" = function(x) (x >= 0.95), - "cyclicality" = function(x) (x >= 0.90), - "volatility" = function(x) (x >= 0.90), - "rolling_sd" = function(x) (x <= 0.80), - "spatial" = function(x) (x <= 0.80), - "residual" = function(x) (x <= 2), - "trend_strength" = function(x) (x >= 0.20) + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") ) + check_labels <- c( - "long_term" = "Long-term (>= 95%)", - "short_term" = "Short-term (>= 95%)", - "cyclicality" = "Cyclicality (>= 90%)", - "volatility" = "Volatility (>= 90%)", - "rolling_sd" = "Rolling SD (<= 80%)", - "spatial" = "Spatial (<= 80%)", - "residual" = "Residual (<= 2)", - "trend_strength" = "Trend strength (>= 20%)" + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" ) + list(checks = checks, check_labels = check_labels) } compute_national_coherency_metrics <- function(df, checks, check_labels) { - coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { - vals <- df[[check_name]] - tibble::tibble( - check = check_name, - label = check_labels[[check_name]], - percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + if (length(check_cols) == 0) { + return(tibble::tibble( + YEAR = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = factor() + )) + } + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) ) - }) - coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) - coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + - ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + - ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + - ggplot2::scale_fill_gradient2( - low = "#f7fcf5", mid = "#74c476", high = "#00441b", - midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 ) + - ggplot2::scale_x_continuous(expand = c(0, 0)) + ggplot2::labs( - title = "National coherence overview", - subtitle = "Percentage of indicators meeting each coherence criterion", - x = NULL, y = NULL + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL ) + - ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme_minimal(base_size = 14) + ggplot2::theme( - axis.text.x = ggplot2::element_blank(), - axis.ticks = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), - legend.position = "right", - plot.title = ggplot2::element_text(face = "bold"), - plot.subtitle = ggplot2::element_text(color = "gray30"), - axis.text.y = ggplot2::element_text(face = "bold") + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") ) } compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") - adm_long <- lapply(ADM_levels, function(level) { - df %>% - dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% - dplyr::group_by(.data[[level]], INDICATOR) %>% - dplyr::summarise( - dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), - n_reports = dplyr::n(), - .groups = "drop" + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + if (length(valid_checks) == 0) { + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise(total_reports = dplyr::n(), .groups = "drop") %>% + dplyr::filter(.data$total_reports >= min_reports) + adm_long <- tibble::tibble( + ADM1_NAME = character(), + ADM2_NAME = character(), + ADM2_ID = character(), + YEAR = integer(), + total_reports = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = character() + ) + return(list(adm_coherence = adm_coherence, adm_long = adm_long)) + } + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) ) %>% - dplyr::filter(n_reports >= min_reports) %>% - tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% - dplyr::mutate(level = level, label = check_labels[check]) - }) %>% dplyr::bind_rows() - adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) - adm_long + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) } From 615fab0180a17030c14012cbc2fb5d4167d4df7e Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 27 Mar 2026 13:45:33 +0100 Subject: [PATCH 12/17] fix --- ...s2_outliers_imputation_magic_glasses.ipynb | 37 +++++++++- .../snt_dhis2_outliers_imputation_mean.ipynb | 68 ++++++++++--------- 2 files changed, 72 insertions(+), 33 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index cfdccf2..be13bd5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -93,13 +93,34 @@ " log_msg(glue::glue(\"Using parallel seasonal detection with {SEASONAL_WORKERS} workers\"))\n", "}\n", "\n", - "config_json <- fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", + "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", + "\n", + "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\")\n", + "for (conf in snt_config_mandatory) {\n", + " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", + " msg <- paste(\"Missing configuration input:\", conf)\n", + " log_msg(msg)\n", + " stop(msg)\n", + " }\n", + "}\n", + "\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", "indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", "\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\"))\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- glue::glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\")\n", + " log_msg(msg)\n", + " stop(msg)\n", + "})\n", + "log_msg(glue::glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", + "log_msg(glue::glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", + "\n", + "if (all(c(\"YEAR\", \"MONTH\") %in% names(dhis2_routine))) {\n", + " dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)\n", + "}\n", "\n", "cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine))\n", "dt_routine <- as.data.table(dhis2_routine)[, ..cols_to_select]\n", @@ -113,6 +134,18 @@ " variable.factor = FALSE\n", ")\n", "\n", + "# Remove duplicated values (same strategy as mean/median)\n", + "dup_keys <- c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"YEAR\", \"MONTH\", \"INDICATOR\")\n", + "dup_keys <- intersect(dup_keys, names(dhis2_routine_long))\n", + "if (length(dup_keys) > 0) {\n", + " duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L]\n", + " if (nrow(duplicated) > 0) {\n", + " log_msg(glue::glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", + " data.table::setkeyv(dhis2_routine_long, dup_keys)\n", + " dhis2_routine_long <- unique(dhis2_routine_long)\n", + " }\n", + "}\n", + "\n", "if (DEV_SUBSET) {\n", " unique_adm1 <- unique(dhis2_routine_long$ADM1_ID)\n", " adm1_to_keep <- unique_adm1[seq_len(min(DEV_SUBSET_ADM1_N, length(unique_adm1)))]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 9413267..2a4ac29 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -131,12 +131,15 @@ }, "outputs": [], "source": [ - "# Check SNT configuration (shared helper)\n", - "validate_required_config_keys(\n", - " config_json = config_json,\n", - " keys = c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\"),\n", - " section = \"SNT_CONFIG\"\n", - ")\n", + "# Check SNT configuration \n", + "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", + "for (conf in snt_config_mandatory) {\n", + " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", + " msg <- paste(\"Missing configuration input:\", conf)\n", + " log_msg(msg)\n", + " stop(msg)\n", + " }\n", + "}\n", "\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", @@ -174,14 +177,16 @@ }, "outputs": [], "source": [ - "# Load file from dataset (formatting) using shared helper\n", + "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- load_country_file_from_dataset(\n", - " dataset_id = dataset_name,\n", - " country_code = COUNTRY_CODE,\n", - " suffix = \"_routine.parquet\",\n", - " label = \"DHIS2 routine data\"\n", - ")\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", + " log_msg(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", "\n", "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", @@ -199,9 +204,8 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers\n", - "# Use shared helper to normalize types when columns exist\n", - "dhis2_routine <- normalize_year_month_types(dhis2_routine)" + "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", + "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" ] }, { @@ -262,13 +266,9 @@ }, "outputs": [], "source": [ - "# Use shared helper to select, pivot and deduplicate routine rows\n", - "dhis2_routine_long <- prepare_routine_long(\n", - " routine_df = dhis2_routine,\n", - " fixed_cols = fixed_cols,\n", - " indicators = DHIS2_INDICATORS,\n", - " deduplicate = TRUE\n", - ")\n", + "dhis2_routine_long <- dhis2_routine %>%\n", + " select(all_of(c(fixed_cols, DHIS2_INDICATORS))) %>%\n", + " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\")\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -293,8 +293,19 @@ }, "outputs": [], "source": [ - "# Duplicates are handled by prepare_routine_long(..., deduplicate = TRUE)\n", - "log_msg(\"Routine long data prepared with shared helper (deduplication applied).\")" + "# check if there are any duplicates\n", + "duplicated <- dhis2_routine_long %>%\n", + " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR) %>%\n", + " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", + " filter(n > 1L)\n", + "\n", + "# Remove dups\n", + "if (nrow(duplicated) > 0) {\n", + " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", + " dhis2_routine_long <- dhis2_routine_long %>%\n", + " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR, .keep_all = TRUE)\n", + " head(duplicated)\n", + "}" ] }, { @@ -672,12 +683,7 @@ }, "outputs": [], "source": [ - "output_path <- standard_output_path(\n", - " data_root_path = DATA_PATH,\n", - " domain = \"dhis2\",\n", - " subdomain = \"outliers_imputation\",\n", - " create_dir = TRUE\n", - ")\n", + "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", "\n", "# Mean detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", From a4d521997ac43cee6e27ed73b3f701249502b4da Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 27 Mar 2026 14:42:14 +0100 Subject: [PATCH 13/17] last fix --- .../snt_dhis2_outliers_imputation_path.ipynb | 130 +++--------------- .../snt_dhis2_outliers_imputation_path.r | 106 ++++++++++++++ 2 files changed, 125 insertions(+), 111 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r diff --git a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb index 7504401..3fb9d5e 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb @@ -65,14 +65,16 @@ }, "outputs": [], "source": [ - "# Project folders\n", - "ROOT_PATH <- \"~/workspace\" \n", + "# Project folders (ROOT_PATH injected by pipeline if available)\n", + "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\" \n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_path\")\n", "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", "\n", "# Load utils\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_path.r\"))\n", "\n", "# Load libraries \n", "required_packages <- c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", @@ -84,7 +86,7 @@ "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "\n", "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "openhexa <- reticulate::import(\"openhexa.sdk\")" ] }, { @@ -129,7 +131,7 @@ "outputs": [], "source": [ "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", " error = function(e) {\n", " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", " log_msg(msg)\n", @@ -270,12 +272,8 @@ }, "outputs": [], "source": [ - "dhis2_routine_long <- dhis2_routine %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM1_NAME\", \"ADM2_ID\", \"ADM2_NAME\", \"OU_ID\", \"OU_NAME\", \"PERIOD\", DHIS2_INDICATORS))) %>%\n", - " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\") %>%\n", - " # ⚠️ NEW: Complete missing date-indicator combinations for each facility\n", - " complete(nesting(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME), PERIOD, INDICATOR) %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"INDICATOR\", \"VALUE\")))\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_path.r\n", + "dhis2_routine_long <- build_path_routine_long(dhis2_routine, DHIS2_INDICATORS)\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -300,17 +298,10 @@ }, "outputs": [], "source": [ - "# check if there are any duplicates\n", - "duplicated <- dhis2_routine_long %>%\n", - " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>%\n", - " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", - " filter(n > 1L)\n", - "\n", - "# Remove dups\n", + "dedup_result <- remove_path_duplicates(dhis2_routine_long)\n", + "dhis2_routine_long <- dedup_result$data\n", + "duplicated <- dedup_result$duplicated\n", "if (nrow(duplicated) > 0) {\n", - " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " dhis2_routine_long <- dhis2_routine_long %>%\n", - " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, .keep_all = TRUE)\n", " head(duplicated)\n", "}" ] @@ -413,23 +404,7 @@ }, "outputs": [], "source": [ - "# high presumed cases during lower tests\n", - "low_testing_periods <- dhis2_routine_outliers %>%\n", - " filter(INDICATOR == \"TEST\") %>%\n", - " mutate(\n", - " low_testing = case_when(VALUE < MEAN_80 ~ TRUE, TRUE ~ FALSE), \n", - " # presumed may not exceed upper limits for tests \n", - " upper_limit_tested = MEAN_80 + MEAN_DEVIATION * SD_80) %>% \n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"low_testing\", \"upper_limit_tested\")))\n", - "\n", - "# decide which one could be possible stock-out periods\n", - "possible_stockout <- dhis2_routine_outliers %>%\n", - " filter(OUTLIER_TREND == TRUE) %>%\n", - " left_join(low_testing_periods, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>% \n", - " # make sure value does not exceed reasonable figures\n", - " mutate(POSSIBLE_STKOUT = case_when(low_testing == TRUE & INDICATOR == \"PRES\" & VALUE < upper_limit_tested ~ TRUE, TRUE ~ FALSE)) %>%\n", - " filter(POSSIBLE_STKOUT == TRUE) %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"POSSIBLE_STKOUT\")))" + "possible_stockout <- detect_possible_stockout(dhis2_routine_outliers, MEAN_DEVIATION)" ] }, { @@ -481,23 +456,7 @@ }, "outputs": [], "source": [ - "# ⚠️ UPDATED\n", - "possible_epidemic <- dhis2_routine_outliers %>% \n", - " filter(INDICATOR == \"TEST\" | INDICATOR == \"CONF\") %>%\n", - " rename(total = VALUE) %>% \n", - " # outlier threshold max value\n", - " mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80) %>% \n", - " # remove columns not necessary for wider format\n", - " select(-c(\"MEAN_80\", \"SD_80\")) %>% \n", - " # wider format with two values (value and outlier-threshold max value) for each INDICATOR\n", - " pivot_wider(names_from = INDICATOR, values_from = c(total, max_value, OUTLIER_TREND)) %>% \n", - " unnest(cols = everything()) %>% \n", - " # ⚠️ NEW LOGIC: epidemic if CONF is outlier AND (TEST is outlier OR tests >= confirmed)\n", - " mutate(POSSIBLE_EPID = case_when(\n", - " OUTLIER_TREND_CONF == TRUE & (OUTLIER_TREND_TEST == TRUE | total_TEST >= total_CONF) ~ TRUE,\n", - " TRUE ~ FALSE)) %>%\n", - " filter(POSSIBLE_EPID == TRUE) %>% \n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"POSSIBLE_EPID\")))\n", + "possible_epidemic <- detect_possible_epidemic(dhis2_routine_outliers, MEAN_DEVIATION)\n", "\n", "epidemic_n <- length(unique(possible_epidemic$OU_ID))\n", "if (epidemic_n > 0) { \n", @@ -531,34 +490,11 @@ }, "outputs": [], "source": [ - "# Join columns and correct outliers column\n", - "routine_data_outliers_clean <- dhis2_routine_outliers %>% \n", - " left_join(possible_stockout, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>%\n", - " mutate(OUTLIER_TREND_01 = case_when(OUTLIER_TREND == TRUE & INDICATOR ==\"PRES\" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>%\n", - " left_join(possible_epidemic, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>%\n", - " mutate(OUTLIER_TREND_02 = case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c(\"CONF\", \"TEST\") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>% \n", - " select(-OUTLIER_TREND) %>%\n", - " rename(OUTLIER_TREND = OUTLIER_TREND_02) %>% \n", - " mutate(\n", - " YEAR = as.integer(substr(PERIOD, 1, 4)),\n", - " MONTH = as.integer(substr(PERIOD, 5, 6))) %>%\n", - " select(all_of(\n", - " c(\n", - " \"PERIOD\",\n", - " \"YEAR\",\n", - " \"MONTH\",\n", - " \"ADM1_ID\",\n", - " \"ADM2_ID\",\n", - " \"OU_ID\", \n", - " \"INDICATOR\",\n", - " \"VALUE\",\n", - " \"MEAN_80\",\n", - " \"SD_80\",\n", - " \"OUTLIER_TREND\",\n", - " \"POSSIBLE_STKOUT\", \n", - " \"POSSIBLE_EPID\" \n", - " )\n", - " ))\n", + "routine_data_outliers_clean <- build_path_clean_outliers(\n", + " dhis2_routine_outliers = dhis2_routine_outliers,\n", + " possible_stockout = possible_stockout,\n", + " possible_epidemic = possible_epidemic\n", + ")\n", "\n", "print(dim(routine_data_outliers_clean))\n", "head(routine_data_outliers_clean, 2)" @@ -616,35 +552,7 @@ }, "outputs": [], "source": [ - "# ⚠️ UPDATED: Added reversal check to prevent illogical corrections\n", - "# replace outliers by mean_80\n", - "routine_data_outliers_imputed <- routine_data_outliers_clean %>% \n", - " rename(VALUE_OLD = VALUE) %>% \n", - " # replace outliers with the mean 80% value\n", - " mutate(VALUE_IMPUTED = ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>%\n", - " # ⚠️ NEW: Pivot to check test/conf relationship\n", - " select(all_of(c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"INDICATOR\", \"VALUE_OLD\", \"VALUE_IMPUTED\", \"OUTLIER_TREND\"))) %>%\n", - " pivot_wider(names_from = INDICATOR, values_from = c(VALUE_OLD, VALUE_IMPUTED, OUTLIER_TREND)) %>%\n", - " # ⚠️ NEW: Reversal check - undo corrections if they create impossible situations\n", - " # (i.e., if imputed tests < imputed conf, but original tests > original conf)\n", - " mutate(reverse_val = case_when(\n", - " !is.na(VALUE_IMPUTED_TEST) & !is.na(VALUE_IMPUTED_CONF) & \n", - " VALUE_IMPUTED_TEST < VALUE_IMPUTED_CONF & \n", - " VALUE_OLD_TEST > VALUE_OLD_CONF ~ TRUE,\n", - " TRUE ~ FALSE)) %>%\n", - " # Correct TEST if reversed\n", - " mutate(VALUE_IMPUTED_TEST = ifelse(reverse_val == TRUE, VALUE_OLD_TEST, VALUE_IMPUTED_TEST),\n", - " OUTLIER_TREND_TEST = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_TEST)) %>%\n", - " # Correct CONF if reversed\n", - " mutate(VALUE_IMPUTED_CONF = ifelse(reverse_val == TRUE, VALUE_OLD_CONF, VALUE_IMPUTED_CONF),\n", - " OUTLIER_TREND_CONF = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_CONF)) %>%\n", - " # ⚠️ Pivot back to long format\n", - " select(-reverse_val) %>%\n", - " pivot_longer(cols = starts_with(\"VALUE_OLD_\") | starts_with(\"VALUE_IMPUTED_\") | starts_with(\"OUTLIER_TREND_\"),\n", - " names_to = c(\".value\", \"INDICATOR\"),\n", - " names_pattern = \"(.*)_(.*)$\") %>%\n", - " arrange(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"INDICATOR\") %>%\n", - " select(all_of(c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"INDICATOR\", \"VALUE_OLD\", \"VALUE_IMPUTED\", \"OUTLIER_TREND\")))\n", + "routine_data_outliers_imputed <- impute_path_outliers(routine_data_outliers_clean)\n", "\n", "print(dim(routine_data_outliers_imputed))\n", "head(routine_data_outliers_imputed, 2)" diff --git a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r new file mode 100644 index 0000000..3348940 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r @@ -0,0 +1,106 @@ +# Helpers for PATH outliers imputation notebook. + +build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) { + dhis2_routine %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME", "OU_ID", "OU_NAME", "PERIOD", DHIS2_INDICATORS))) %>% + tidyr::pivot_longer(cols = dplyr::all_of(DHIS2_INDICATORS), names_to = "INDICATOR", values_to = "VALUE") %>% + tidyr::complete(tidyr::nesting(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME), PERIOD, INDICATOR) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "INDICATOR", "VALUE"))) +} + +remove_path_duplicates <- function(dhis2_routine_long) { + duplicated <- dhis2_routine_long %>% + dplyr::group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>% + dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% + dplyr::filter(n > 1L) + + if (nrow(duplicated) > 0) { + log_msg(glue::glue("Removing {nrow(duplicated)} duplicated values.")) + dhis2_routine_long <- dhis2_routine_long %>% + dplyr::distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, .keep_all = TRUE) + } + + list(data = dhis2_routine_long, duplicated = duplicated) +} + +detect_possible_stockout <- function(dhis2_routine_outliers, MEAN_DEVIATION) { + low_testing_periods <- dhis2_routine_outliers %>% + dplyr::filter(INDICATOR == "TEST") %>% + dplyr::mutate( + low_testing = dplyr::case_when(VALUE < MEAN_80 ~ TRUE, TRUE ~ FALSE), + upper_limit_tested = MEAN_80 + MEAN_DEVIATION * SD_80 + ) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "low_testing", "upper_limit_tested"))) + + dhis2_routine_outliers %>% + dplyr::filter(OUTLIER_TREND == TRUE) %>% + dplyr::left_join(low_testing_periods, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% + dplyr::mutate(POSSIBLE_STKOUT = dplyr::case_when(low_testing == TRUE & INDICATOR == "PRES" & VALUE < upper_limit_tested ~ TRUE, TRUE ~ FALSE)) %>% + dplyr::filter(POSSIBLE_STKOUT == TRUE) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_STKOUT"))) +} + +detect_possible_epidemic <- function(dhis2_routine_outliers, MEAN_DEVIATION) { + dhis2_routine_outliers %>% + dplyr::filter(INDICATOR == "TEST" | INDICATOR == "CONF") %>% + dplyr::rename(total = VALUE) %>% + dplyr::mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80) %>% + dplyr::select(-c("MEAN_80", "SD_80")) %>% + tidyr::pivot_wider(names_from = INDICATOR, values_from = c(total, max_value, OUTLIER_TREND)) %>% + tidyr::unnest(cols = dplyr::everything()) %>% + dplyr::mutate(POSSIBLE_EPID = dplyr::case_when( + OUTLIER_TREND_CONF == TRUE & (OUTLIER_TREND_TEST == TRUE | total_TEST >= total_CONF) ~ TRUE, + TRUE ~ FALSE + )) %>% + dplyr::filter(POSSIBLE_EPID == TRUE) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_EPID"))) +} + +build_path_clean_outliers <- function(dhis2_routine_outliers, possible_stockout, possible_epidemic) { + dhis2_routine_outliers %>% + dplyr::left_join(possible_stockout, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% + dplyr::mutate(OUTLIER_TREND_01 = dplyr::case_when(OUTLIER_TREND == TRUE & INDICATOR == "PRES" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>% + dplyr::left_join(possible_epidemic, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% + dplyr::mutate(OUTLIER_TREND_02 = dplyr::case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c("CONF", "TEST") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>% + dplyr::select(-OUTLIER_TREND) %>% + dplyr::rename(OUTLIER_TREND = OUTLIER_TREND_02) %>% + dplyr::mutate( + YEAR = as.integer(substr(PERIOD, 1, 4)), + MONTH = as.integer(substr(PERIOD, 5, 6)) + ) %>% + dplyr::select(dplyr::all_of(c( + "PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", + "INDICATOR", "VALUE", "MEAN_80", "SD_80", + "OUTLIER_TREND", "POSSIBLE_STKOUT", "POSSIBLE_EPID" + ))) +} + +impute_path_outliers <- function(routine_data_outliers_clean) { + routine_data_outliers_clean %>% + dplyr::rename(VALUE_OLD = VALUE) %>% + dplyr::mutate(VALUE_IMPUTED = ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>% + dplyr::select(dplyr::all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND"))) %>% + tidyr::pivot_wider(names_from = INDICATOR, values_from = c(VALUE_OLD, VALUE_IMPUTED, OUTLIER_TREND)) %>% + dplyr::mutate(reverse_val = dplyr::case_when( + !is.na(VALUE_IMPUTED_TEST) & !is.na(VALUE_IMPUTED_CONF) & + VALUE_IMPUTED_TEST < VALUE_IMPUTED_CONF & + VALUE_OLD_TEST > VALUE_OLD_CONF ~ TRUE, + TRUE ~ FALSE + )) %>% + dplyr::mutate( + VALUE_IMPUTED_TEST = ifelse(reverse_val == TRUE, VALUE_OLD_TEST, VALUE_IMPUTED_TEST), + OUTLIER_TREND_TEST = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_TEST) + ) %>% + dplyr::mutate( + VALUE_IMPUTED_CONF = ifelse(reverse_val == TRUE, VALUE_OLD_CONF, VALUE_IMPUTED_CONF), + OUTLIER_TREND_CONF = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_CONF) + ) %>% + dplyr::select(-reverse_val) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("VALUE_OLD_") | dplyr::starts_with("VALUE_IMPUTED_") | dplyr::starts_with("OUTLIER_TREND_"), + names_to = c(".value", "INDICATOR"), + names_pattern = "(.*)_(.*)$" + ) %>% + dplyr::arrange(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>% + dplyr::select(dplyr::all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND"))) +} From e3404c85199ac07b8adc136154c095b3afaa1a95 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 27 Mar 2026 15:07:51 +0100 Subject: [PATCH 14/17] Refactor magic glasses input preparation and outlier detection functions for improved modularity and clarity. Introduced `prepare_magic_glasses_input` to streamline input handling and context setup, while enhancing logging and error management. Updated notebook to utilize new function for better organization. --- ...s2_outliers_imputation_magic_glasses.ipynb | 288 +++-------------- ..._dhis2_outliers_imputation_magic_glasses.r | 296 ++++++++++++++++++ 2 files changed, 334 insertions(+), 250 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index be13bd5..ce4fdc2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -66,101 +66,27 @@ "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses.r\"))\n", "\n", - "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " required_packages <- c(required_packages, \"forecast\")\n", - "}\n", - "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", - " required_packages <- c(required_packages, \"future\", \"future.apply\")\n", - "}\n", - "\n", - "setup_ctx <- bootstrap_magic_glasses_context(\n", + "mg_input <- prepare_magic_glasses_input(\n", " root_path = ROOT_PATH,\n", - " required_packages = required_packages\n", + " config_file_name = CONFIG_FILE_NAME,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " seasonal_workers = SEASONAL_WORKERS,\n", + " dev_subset = DEV_SUBSET,\n", + " dev_subset_adm1_n = DEV_SUBSET_ADM1_N\n", ")\n", "\n", + "setup_ctx <- mg_input$setup_ctx\n", + "config_json <- mg_input$config_json\n", + "COUNTRY_CODE <- mg_input$country_code\n", + "fixed_cols <- mg_input$fixed_cols\n", + "indicators_to_keep <- mg_input$indicators_to_keep\n", + "dhis2_routine <- mg_input$dhis2_routine\n", + "dhis2_routine_long <- mg_input$dhis2_routine_long\n", + "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", - " future::plan(future::multisession, workers = SEASONAL_WORKERS)\n", - " log_msg(glue::glue(\"Using parallel seasonal detection with {SEASONAL_WORKERS} workers\"))\n", - "}\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", - "\n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\")\n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", - "indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue::glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", - "log_msg(glue::glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue::glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", - "\n", - "if (all(c(\"YEAR\", \"MONTH\") %in% names(dhis2_routine))) {\n", - " dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)\n", - "}\n", - "\n", - "cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine))\n", - "dt_routine <- as.data.table(dhis2_routine)[, ..cols_to_select]\n", - "\n", - "dhis2_routine_long <- melt(\n", - " dt_routine,\n", - " id.vars = intersect(fixed_cols, names(dt_routine)),\n", - " measure.vars = intersect(indicators_to_keep, names(dt_routine)),\n", - " variable.name = \"INDICATOR\",\n", - " value.name = \"VALUE\",\n", - " variable.factor = FALSE\n", - ")\n", - "\n", - "# Remove duplicated values (same strategy as mean/median)\n", - "dup_keys <- c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"YEAR\", \"MONTH\", \"INDICATOR\")\n", - "dup_keys <- intersect(dup_keys, names(dhis2_routine_long))\n", - "if (length(dup_keys) > 0) {\n", - " duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L]\n", - " if (nrow(duplicated) > 0) {\n", - " log_msg(glue::glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " data.table::setkeyv(dhis2_routine_long, dup_keys)\n", - " dhis2_routine_long <- unique(dhis2_routine_long)\n", - " }\n", - "}\n", - "\n", - "if (DEV_SUBSET) {\n", - " unique_adm1 <- unique(dhis2_routine_long$ADM1_ID)\n", - " adm1_to_keep <- unique_adm1[seq_len(min(DEV_SUBSET_ADM1_N, length(unique_adm1)))]\n", - " dhis2_routine_long <- dhis2_routine_long[ADM1_ID %in% adm1_to_keep]\n", - " log_msg(glue::glue(\"DEV_SUBSET enabled: keeping {length(adm1_to_keep)} ADM1 values\"), \"warning\")\n", - "}\n", - "\n", - "log_msg(glue::glue(\"Data loaded: {nrow(dhis2_routine_long)} rows, {length(unique(dhis2_routine_long$OU_ID))} facilities\"))\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " n_groups <- uniqueN(dhis2_routine_long[, .(OU_ID, INDICATOR)])\n", - " log_msg(glue::glue(\"Complete mode active: seasonal detection will run on up to {n_groups} OU_ID x INDICATOR time series.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", - "}" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -175,8 +101,9 @@ "outputs": [], "source": [ "# Helpers loaded from utils/snt_dhis2_outliers_imputation_magic_glasses.r\n", - "# - detect_outliers_mad_custom()\n", - "# - detect_seasonal_outliers()" + "# - prepare_magic_glasses_input()\n", + "# - run_magic_glasses_outlier_detection()\n", + "# - export_magic_glasses_outputs()" ] }, { @@ -190,72 +117,18 @@ }, "outputs": [], "source": [ - "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", - " log_msg(\"Starting MAD15 detection...\")\n", - " flagged_outliers_mad15 <- detect_outliers_mad_custom(dhis2_routine_long, DEVIATION_MAD15)\n", - " flagged_outliers_mad15_filtered <- flagged_outliers_mad15[OUTLIER_MAD15 == FALSE]\n", - "\n", - " log_msg(\"Starting MAD10 detection...\")\n", - " flagged_outliers_mad10 <- detect_outliers_mad_custom(flagged_outliers_mad15_filtered, DEVIATION_MAD10)\n", - " setnames(flagged_outliers_mad10, paste0(\"OUTLIER_MAD\", DEVIATION_MAD10), \"OUTLIER_MAD15_MAD10\")\n", - "\n", - " join_cols <- c(\"PERIOD\", \"OU_ID\", \"INDICATOR\")\n", - " mad10_subset <- flagged_outliers_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]\n", - " flagged_outliers_mad15_mad10 <- merge(\n", - " flagged_outliers_mad15,\n", - " mad10_subset,\n", - " by = join_cols,\n", - " all.x = TRUE\n", - " )\n", - " flagged_outliers_mad15_mad10[is.na(OUTLIER_MAD15_MAD10), OUTLIER_MAD15_MAD10 := TRUE]\n", - " log_msg(glue::glue(\"MAD partial done: {sum(flagged_outliers_mad15_mad10$OUTLIER_MAD15_MAD10)} outliers flagged\"))\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " flagged_outliers_mad15_mad10_filtered <- flagged_outliers_mad15_mad10[OUTLIER_MAD15_MAD10 == FALSE]\n", - "\n", - " if (nrow(flagged_outliers_mad15_mad10_filtered) == 0) {\n", - " log_msg(\"No rows left after MAD partial filtering; seasonal step will be skipped.\", \"warning\")\n", - " flagged_outliers_seasonal5 <- copy(flagged_outliers_mad15_mad10_filtered)\n", - " flagged_outliers_seasonal5[, OUTLIER_SEASONAL5 := FALSE]\n", - " flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5\n", - " flagged_outliers_seasonal3 <- copy(flagged_outliers_seasonal5_filtered)\n", - " flagged_outliers_seasonal3[, OUTLIER_SEASONAL3 := FALSE]\n", - " } else {\n", - " log_msg(glue::glue(\"Starting SEASONAL5 detection on {nrow(flagged_outliers_mad15_mad10_filtered)} rows...\"))\n", - " t_seasonal5 <- system.time({\n", - " flagged_outliers_seasonal5 <- detect_seasonal_outliers(\n", - " flagged_outliers_mad15_mad10_filtered,\n", - " deviation = DEVIATION_SEASONAL5,\n", - " workers = SEASONAL_WORKERS\n", - " )\n", - " })\n", - " flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5[OUTLIER_SEASONAL5 == FALSE]\n", - " log_msg(glue::glue(\"SEASONAL5 finished in {round(t_seasonal5['elapsed'], 1)}s. Remaining rows: {nrow(flagged_outliers_seasonal5_filtered)}\"))\n", - "\n", - " log_msg(glue::glue(\"Starting SEASONAL3 detection on {nrow(flagged_outliers_seasonal5_filtered)} rows...\"))\n", - " t_seasonal3 <- system.time({\n", - " flagged_outliers_seasonal3 <- detect_seasonal_outliers(\n", - " flagged_outliers_seasonal5_filtered,\n", - " deviation = DEVIATION_SEASONAL3,\n", - " workers = SEASONAL_WORKERS\n", - " )\n", - " })\n", - " log_msg(glue::glue(\"SEASONAL3 finished in {round(t_seasonal3['elapsed'], 1)}s.\"))\n", - " }\n", - "\n", - " setnames(flagged_outliers_seasonal3, paste0(\"OUTLIER_SEASONAL\", DEVIATION_SEASONAL3), \"OUTLIER_SEASONAL5_SEASONAL3\")\n", + "detection_result <- run_magic_glasses_outlier_detection(\n", + " dhis2_routine_long = dhis2_routine_long,\n", + " deviation_mad15 = DEVIATION_MAD15,\n", + " deviation_mad10 = DEVIATION_MAD10,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " deviation_seasonal5 = DEVIATION_SEASONAL5,\n", + " deviation_seasonal3 = DEVIATION_SEASONAL3,\n", + " seasonal_workers = SEASONAL_WORKERS\n", + ")\n", "\n", - " seasonal3_subset <- flagged_outliers_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]\n", - " flagged_outliers_seasonal5_seasonal3 <- merge(\n", - " flagged_outliers_seasonal5,\n", - " seasonal3_subset,\n", - " by = join_cols,\n", - " all.x = TRUE\n", - " )\n", - " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", - " log_msg(glue::glue(\"SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", - "}" + "flagged_outliers_mad15_mad10 <- detection_result$flagged_outliers_mad15_mad10\n", + "flagged_outliers_seasonal5_seasonal3 <- detection_result$flagged_outliers_seasonal5_seasonal3" ] }, { @@ -269,102 +142,17 @@ }, "outputs": [], "source": [ - "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", - "flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", - "join_cols <- c(\"PERIOD\", \"OU_ID\", \"INDICATOR\")\n", - "\n", - "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", - " partial_subset <- flagged_outliers_mad15_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]\n", - " flagged_outliers_mg <- merge(flagged_outliers_mg, partial_subset, by = join_cols, all.x = TRUE)\n", - " setnames(flagged_outliers_mg, \"OUTLIER_MAD15_MAD10\", \"OUTLIER_MAGIC_GLASSES_PARTIAL\")\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " complete_subset <- flagged_outliers_seasonal5_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]\n", - " flagged_outliers_mg <- merge(flagged_outliers_mg, complete_subset, by = join_cols, all.x = TRUE)\n", - " setnames(flagged_outliers_mg, \"OUTLIER_SEASONAL5_SEASONAL3\", \"OUTLIER_MAGIC_GLASSES_COMPLETE\")\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE) & OUTLIER_MAGIC_GLASSES_PARTIAL == TRUE,\n", - " OUTLIER_MAGIC_GLASSES_COMPLETE := TRUE]\n", - "}\n", - "\n", - "if (\"OUTLIER_MAGIC_GLASSES_PARTIAL\" %in% names(flagged_outliers_mg)) {\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_PARTIAL), OUTLIER_MAGIC_GLASSES_PARTIAL := FALSE]\n", - "}\n", - "if (\"OUTLIER_MAGIC_GLASSES_COMPLETE\" %in% names(flagged_outliers_mg)) {\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE), OUTLIER_MAGIC_GLASSES_COMPLETE := FALSE]\n", - "}\n", - "\n", - "active_outlier_col <- if (\n", - " RUN_MAGIC_GLASSES_COMPLETE && \"OUTLIER_MAGIC_GLASSES_COMPLETE\" %in% names(flagged_outliers_mg)\n", - ") {\n", - " \"OUTLIER_MAGIC_GLASSES_COMPLETE\"\n", - "} else {\n", - " \"OUTLIER_MAGIC_GLASSES_PARTIAL\"\n", - "}\n", - "\n", - "if (!(active_outlier_col %in% names(flagged_outliers_mg))) {\n", - " stop(glue::glue(\"Expected outlier flag column not found: {active_outlier_col}\"))\n", - "}\n", - "\n", - "pyramid_names <- unique(as.data.table(dhis2_routine)[, .(\n", - " ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME\n", - ")])\n", - "\n", - "# 1) Detected table: full routine data + OUTLIER_DETECTED flag (same structure as mean/median/iqr/path)\n", - "outlier_method_label <- if (active_outlier_col == \"OUTLIER_MAGIC_GLASSES_COMPLETE\") \"MAGIC_GLASSES_COMPLETE\" else \"MAGIC_GLASSES_PARTIAL\"\n", - "detected_tbl <- flagged_outliers_mg[, .(\n", - " PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE,\n", - " OUTLIER_DETECTED = get(active_outlier_col),\n", - " OUTLIER_METHOD = outlier_method_label\n", - ")]\n", - "detected_tbl[is.na(OUTLIER_DETECTED), OUTLIER_DETECTED := FALSE]\n", - "detected_tbl <- merge(detected_tbl, unique(pyramid_names), by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"), all.x = TRUE)\n", - "detected_tbl[, DATE := as.Date(sprintf(\"%04d-%02d-01\", YEAR, MONTH))]\n", - "arrow::write_parquet(detected_tbl, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")))\n", - "n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE)\n", - "log_msg(glue::glue(\"Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {COUNTRY_CODE}_routine_outliers_detected.parquet\"))\n", - "\n", - "# 2) Imputed routine data (same moving-average logic as other outlier pipelines)\n", - "imputed_long <- copy(flagged_outliers_mg)\n", - "setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH)\n", - "imputed_long[, TO_IMPUTE := fifelse(get(active_outlier_col) == TRUE, NA_real_, VALUE)]\n", - "imputed_long[\n", - " , MOVING_AVG := frollapply(\n", - " TO_IMPUTE,\n", - " n = 3,\n", - " FUN = function(x) ceiling(mean(x, na.rm = TRUE)),\n", - " align = \"center\"\n", - " ),\n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)\n", - "]\n", - "imputed_long[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)]\n", - "imputed_long[, VALUE := VALUE_IMPUTED]\n", - "imputed_long[, c(\"TO_IMPUTE\", \"MOVING_AVG\", \"VALUE_IMPUTED\") := NULL]\n", - "\n", - "routine_imputed <- to_routine_wide(\n", - " dt_long = imputed_long,\n", + "export_magic_glasses_outputs(\n", + " dhis2_routine_long = dhis2_routine_long,\n", + " flagged_outliers_mad15_mad10 = flagged_outliers_mad15_mad10,\n", + " flagged_outliers_seasonal5_seasonal3 = flagged_outliers_seasonal5_seasonal3,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " dhis2_routine = dhis2_routine,\n", " fixed_cols = fixed_cols,\n", " indicators_to_keep = indicators_to_keep,\n", - " pyramid_names = pyramid_names\n", - ")\n", - "arrow::write_parquet(routine_imputed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_imputed.parquet\")))\n", - "log_msg(glue::glue(\"Exported routine imputed table to {COUNTRY_CODE}_routine_outliers_imputed.parquet\"))\n", - "\n", - "# 3) Removed routine data\n", - "# We set outlier values to NA (we do not remove rows). The routine data keeps the same structure.\n", - "removed_long <- copy(flagged_outliers_mg)\n", - "removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_]\n", - "\n", - "routine_removed <- to_routine_wide(\n", - " dt_long = removed_long,\n", - " fixed_cols = fixed_cols,\n", - " indicators_to_keep = indicators_to_keep,\n", - " pyramid_names = pyramid_names\n", - ")\n", - "arrow::write_parquet(routine_removed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_removed.parquet\")))\n", - "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", - "\n", - "log_msg(\"MG outlier tables exported successfully.\")" + " output_dir = OUTPUT_DIR,\n", + " country_code = COUNTRY_CODE\n", + ")" ] } ], diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index 28ec619..8cd4bb4 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -17,6 +17,9 @@ bootstrap_magic_glasses_context <- function( if (load_openhexa) { openhexa <- reticulate::import("openhexa.sdk") } + # snt_utils::log_msg() relies on a global `openhexa` object. + # Expose it before any helper function logs messages. + assign("openhexa", openhexa, envir = .GlobalEnv) return(list( ROOT_PATH = root_path, @@ -125,3 +128,296 @@ to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_nam routine_wide } +prepare_magic_glasses_input <- function( + root_path, + config_file_name = "SNT_config.json", + run_complete = FALSE, + seasonal_workers = 1, + dev_subset = FALSE, + dev_subset_adm1_n = 2 +) { + required_packages <- c("arrow", "data.table", "jsonlite", "reticulate", "glue") + if (run_complete) { + required_packages <- c(required_packages, "forecast") + } + if (run_complete && seasonal_workers > 1) { + required_packages <- c(required_packages, "future", "future.apply") + } + + setup_ctx <- bootstrap_magic_glasses_context( + root_path = root_path, + required_packages = required_packages + ) + + if (run_complete) { + log_msg("[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.", "warning") + } + + if (run_complete && seasonal_workers > 1) { + future::plan(future::multisession, workers = seasonal_workers) + log_msg(glue::glue("Using parallel seasonal detection with {seasonal_workers} workers")) + } + + config_json <- jsonlite::fromJSON(file.path(setup_ctx$CONFIG_PATH, config_file_name)) + + snt_config_mandatory <- c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2") + for (conf in snt_config_mandatory) { + if (is.null(config_json$SNT_CONFIG[[conf]])) { + msg <- paste("Missing configuration input:", conf) + log_msg(msg) + stop(msg) + } + } + + country_code <- config_json$SNT_CONFIG$COUNTRY_CODE + fixed_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID") + indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) + + dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (all(c("YEAR", "MONTH") %in% names(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine)) + dt_routine <- data.table::as.data.table(dhis2_routine)[, ..cols_to_select] + + dhis2_routine_long <- data.table::melt( + dt_routine, + id.vars = intersect(fixed_cols, names(dt_routine)), + measure.vars = intersect(indicators_to_keep, names(dt_routine)), + variable.name = "INDICATOR", + value.name = "VALUE", + variable.factor = FALSE + ) + + dup_keys <- c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "YEAR", "MONTH", "INDICATOR") + dup_keys <- intersect(dup_keys, names(dhis2_routine_long)) + if (length(dup_keys) > 0) { + duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L] + if (nrow(duplicated) > 0) { + log_msg(glue::glue("Removing {nrow(duplicated)} duplicated values.")) + data.table::setkeyv(dhis2_routine_long, dup_keys) + dhis2_routine_long <- unique(dhis2_routine_long) + } + } + + if (dev_subset) { + unique_adm1 <- unique(dhis2_routine_long$ADM1_ID) + adm1_to_keep <- unique_adm1[seq_len(min(dev_subset_adm1_n, length(unique_adm1)))] + dhis2_routine_long <- dhis2_routine_long[ADM1_ID %in% adm1_to_keep] + log_msg(glue::glue("DEV_SUBSET enabled: keeping {length(adm1_to_keep)} ADM1 values"), "warning") + } + + log_msg(glue::glue("Data loaded: {nrow(dhis2_routine_long)} rows, {length(unique(dhis2_routine_long$OU_ID))} facilities")) + + if (run_complete) { + n_groups <- data.table::uniqueN(dhis2_routine_long[, .(OU_ID, INDICATOR)]) + log_msg(glue::glue("Complete mode active: seasonal detection will run on up to {n_groups} OU_ID x INDICATOR time series."), "warning") + } else { + log_msg("Partial mode active: seasonal detection is skipped.") + } + + list( + setup_ctx = setup_ctx, + config_json = config_json, + country_code = country_code, + fixed_cols = fixed_cols, + indicators_to_keep = indicators_to_keep, + dhis2_routine = dhis2_routine, + dhis2_routine_long = dhis2_routine_long + ) +} + +run_magic_glasses_outlier_detection <- function( + dhis2_routine_long, + deviation_mad15 = 15, + deviation_mad10 = 10, + run_complete = FALSE, + deviation_seasonal5 = 5, + deviation_seasonal3 = 3, + seasonal_workers = 1 +) { + log_msg("Starting MAD15 detection...") + flagged_outliers_mad15 <- detect_outliers_mad_custom(dhis2_routine_long, deviation_mad15) + flagged_outliers_mad15_filtered <- flagged_outliers_mad15[OUTLIER_MAD15 == FALSE] + + log_msg("Starting MAD10 detection...") + flagged_outliers_mad10 <- detect_outliers_mad_custom(flagged_outliers_mad15_filtered, deviation_mad10) + data.table::setnames(flagged_outliers_mad10, paste0("OUTLIER_MAD", deviation_mad10), "OUTLIER_MAD15_MAD10") + + join_cols <- c("PERIOD", "OU_ID", "INDICATOR") + mad10_subset <- flagged_outliers_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)] + flagged_outliers_mad15_mad10 <- merge( + flagged_outliers_mad15, + mad10_subset, + by = join_cols, + all.x = TRUE + ) + flagged_outliers_mad15_mad10[is.na(OUTLIER_MAD15_MAD10), OUTLIER_MAD15_MAD10 := TRUE] + log_msg(glue::glue("MAD partial done: {sum(flagged_outliers_mad15_mad10$OUTLIER_MAD15_MAD10)} outliers flagged")) + + flagged_outliers_seasonal5_seasonal3 <- NULL + if (run_complete) { + flagged_outliers_mad15_mad10_filtered <- flagged_outliers_mad15_mad10[OUTLIER_MAD15_MAD10 == FALSE] + + if (nrow(flagged_outliers_mad15_mad10_filtered) == 0) { + log_msg("No rows left after MAD partial filtering; seasonal step will be skipped.", "warning") + flagged_outliers_seasonal5 <- data.table::copy(flagged_outliers_mad15_mad10_filtered) + flagged_outliers_seasonal5[, OUTLIER_SEASONAL5 := FALSE] + flagged_outliers_seasonal3 <- data.table::copy(flagged_outliers_seasonal5) + flagged_outliers_seasonal3[, OUTLIER_SEASONAL3 := FALSE] + } else { + log_msg(glue::glue("Starting SEASONAL5 detection on {nrow(flagged_outliers_mad15_mad10_filtered)} rows...")) + t_seasonal5 <- system.time({ + flagged_outliers_seasonal5 <- detect_seasonal_outliers( + flagged_outliers_mad15_mad10_filtered, + deviation = deviation_seasonal5, + workers = seasonal_workers + ) + }) + flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5[OUTLIER_SEASONAL5 == FALSE] + log_msg(glue::glue("SEASONAL5 finished in {round(t_seasonal5['elapsed'], 1)}s. Remaining rows: {nrow(flagged_outliers_seasonal5_filtered)}")) + + log_msg(glue::glue("Starting SEASONAL3 detection on {nrow(flagged_outliers_seasonal5_filtered)} rows...")) + t_seasonal3 <- system.time({ + flagged_outliers_seasonal3 <- detect_seasonal_outliers( + flagged_outliers_seasonal5_filtered, + deviation = deviation_seasonal3, + workers = seasonal_workers + ) + }) + log_msg(glue::glue("SEASONAL3 finished in {round(t_seasonal3['elapsed'], 1)}s.")) + } + + data.table::setnames(flagged_outliers_seasonal3, paste0("OUTLIER_SEASONAL", deviation_seasonal3), "OUTLIER_SEASONAL5_SEASONAL3") + + seasonal3_subset <- flagged_outliers_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)] + flagged_outliers_seasonal5_seasonal3 <- merge( + flagged_outliers_seasonal5, + seasonal3_subset, + by = join_cols, + all.x = TRUE + ) + flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE] + log_msg(glue::glue("SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged")) + } + + list( + flagged_outliers_mad15_mad10 = flagged_outliers_mad15_mad10, + flagged_outliers_seasonal5_seasonal3 = flagged_outliers_seasonal5_seasonal3 + ) +} + +export_magic_glasses_outputs <- function( + dhis2_routine_long, + flagged_outliers_mad15_mad10, + flagged_outliers_seasonal5_seasonal3, + run_complete, + dhis2_routine, + fixed_cols, + indicators_to_keep, + output_dir, + country_code +) { + base_cols <- intersect(c(fixed_cols, "INDICATOR", "VALUE"), names(dhis2_routine_long)) + flagged_outliers_mg <- data.table::copy(dhis2_routine_long[, ..base_cols]) + join_cols <- c("PERIOD", "OU_ID", "INDICATOR") + + partial_subset <- flagged_outliers_mad15_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)] + flagged_outliers_mg <- merge(flagged_outliers_mg, partial_subset, by = join_cols, all.x = TRUE) + data.table::setnames(flagged_outliers_mg, "OUTLIER_MAD15_MAD10", "OUTLIER_MAGIC_GLASSES_PARTIAL") + + if (run_complete && !is.null(flagged_outliers_seasonal5_seasonal3)) { + complete_subset <- flagged_outliers_seasonal5_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)] + flagged_outliers_mg <- merge(flagged_outliers_mg, complete_subset, by = join_cols, all.x = TRUE) + data.table::setnames(flagged_outliers_mg, "OUTLIER_SEASONAL5_SEASONAL3", "OUTLIER_MAGIC_GLASSES_COMPLETE") + flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE) & OUTLIER_MAGIC_GLASSES_PARTIAL == TRUE, OUTLIER_MAGIC_GLASSES_COMPLETE := TRUE] + } + + flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_PARTIAL), OUTLIER_MAGIC_GLASSES_PARTIAL := FALSE] + if ("OUTLIER_MAGIC_GLASSES_COMPLETE" %in% names(flagged_outliers_mg)) { + flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE), OUTLIER_MAGIC_GLASSES_COMPLETE := FALSE] + } + + active_outlier_col <- if (run_complete && "OUTLIER_MAGIC_GLASSES_COMPLETE" %in% names(flagged_outliers_mg)) { + "OUTLIER_MAGIC_GLASSES_COMPLETE" + } else { + "OUTLIER_MAGIC_GLASSES_PARTIAL" + } + + if (!(active_outlier_col %in% names(flagged_outliers_mg))) { + stop(glue::glue("Expected outlier flag column not found: {active_outlier_col}")) + } + + pyramid_names <- unique(data.table::as.data.table(dhis2_routine)[, .( + ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME + )]) + + outlier_method_label <- if (active_outlier_col == "OUTLIER_MAGIC_GLASSES_COMPLETE") "MAGIC_GLASSES_COMPLETE" else "MAGIC_GLASSES_PARTIAL" + detected_tbl <- flagged_outliers_mg[, .( + PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE, + OUTLIER_DETECTED = get(active_outlier_col), + OUTLIER_METHOD = outlier_method_label + )] + detected_tbl[is.na(OUTLIER_DETECTED), OUTLIER_DETECTED := FALSE] + detected_tbl <- merge(detected_tbl, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) + detected_tbl[, DATE := as.Date(sprintf("%04d-%02d-01", YEAR, MONTH))] + arrow::write_parquet(detected_tbl, file.path(output_dir, paste0(country_code, "_routine_outliers_detected.parquet"))) + n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE) + log_msg(glue::glue("Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {country_code}_routine_outliers_detected.parquet")) + + imputed_long <- data.table::copy(flagged_outliers_mg) + data.table::setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + imputed_long[, TO_IMPUTE := data.table::fifelse(get(active_outlier_col) == TRUE, NA_real_, VALUE)] + imputed_long[ + , + MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), + by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR) + ] + imputed_long[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + imputed_long[, VALUE := VALUE_IMPUTED] + imputed_long[, c("TO_IMPUTE", "MOVING_AVG", "VALUE_IMPUTED") := NULL] + + routine_imputed <- to_routine_wide( + dt_long = imputed_long, + fixed_cols = fixed_cols, + indicators_to_keep = indicators_to_keep, + pyramid_names = pyramid_names + ) + arrow::write_parquet(routine_imputed, file.path(output_dir, paste0(country_code, "_routine_outliers_imputed.parquet"))) + log_msg(glue::glue("Exported routine imputed table to {country_code}_routine_outliers_imputed.parquet")) + + removed_long <- data.table::copy(flagged_outliers_mg) + removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_] + + routine_removed <- to_routine_wide( + dt_long = removed_long, + fixed_cols = fixed_cols, + indicators_to_keep = indicators_to_keep, + pyramid_names = pyramid_names + ) + arrow::write_parquet(routine_removed, file.path(output_dir, paste0(country_code, "_routine_outliers_removed.parquet"))) + log_msg(glue::glue("Exported routine removed table to {country_code}_routine_outliers_removed.parquet")) + + log_msg("MG outlier tables exported successfully.") + invisible(list(active_outlier_col = active_outlier_col)) +} + From 70427617f8239227dd68251e51c692c6cf65c91a Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 1 Apr 2026 16:56:48 +0200 Subject: [PATCH 15/17] big cleaning --- .../snt_dhis2_outliers_imputation_iqr.ipynb | 9 +- ...dhis2_outliers_imputation_iqr_report.ipynb | 226 +++++++++--------- .../utils/snt_dhis2_outliers_imputation_iqr.r | 3 + ...s2_outliers_imputation_magic_glasses.ipynb | 17 +- ...iers_imputation_magic_glasses_report.ipynb | 8 +- ..._dhis2_outliers_imputation_magic_glasses.r | 3 + .../snt_dhis2_outliers_imputation_mean.ipynb | 9 +- ...his2_outliers_imputation_mean_report.ipynb | 6 +- .../snt_dhis2_outliers_imputation_mean.r | 3 + ...snt_dhis2_outliers_imputation_median.ipynb | 9 +- ...s2_outliers_imputation_median_report.ipynb | 6 +- .../snt_dhis2_outliers_imputation_median.r | 3 + .../snt_dhis2_outliers_imputation_path.ipynb | 27 +-- .../snt_dhis2_outliers_imputation_path.r | 34 +++ snt_dhis2_outliers_imputation_iqr/pipeline.py | 1 - .../pipeline.py | 7 +- .../pipeline.py | 1 - .../pipeline.py | 1 - .../pipeline.py | 1 - 19 files changed, 182 insertions(+), 192 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index 1731e12..ae8d33a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -68,10 +68,7 @@ " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", ")\n", "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -117,7 +114,7 @@ "source": [ "# Load SNT config from bootstrap context\n", "config_json <- setup_ctx$config_json\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -685,7 +682,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# IQR detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index a927dd7..bfb371d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -2,33 +2,36 @@ "cells": [ { "cell_type": "markdown", + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", "metadata": {}, "source": [ "## **Détection des valeurs aberrantes — Méthode IQR**" - ], - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "43794265-533f-4035-bf3d-975a3409507b", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "## CONFIGURATION ##" - ], - "execution_count": null, - "outputs": [], - "id": "43794265-533f-4035-bf3d-975a3409507b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "2ced7513-0ee6-4b9b-ac07-124e510119af", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", @@ -39,23 +42,19 @@ "setup_ctx <- bootstrap_iqr_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", - ")\n", - "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa" - ], - "execution_count": null, - "outputs": [], - "id": "2ced7513-0ee6-4b9b-ac07-124e510119af" + ")" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load SNT config from bootstrap context\n", "config_json <- setup_ctx$config_json\n", @@ -69,32 +68,32 @@ "ADM_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL" - ], - "execution_count": null, - "outputs": [], - "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f8edc2a5-07ce-4507-9939-4322fc510593", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ], - "execution_count": null, - "outputs": [], - "id": "f8edc2a5-07ce-4507-9939-4322fc510593" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# import routine data\n", "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")) }, \n", @@ -119,26 +118,26 @@ " })\n", "\n", "printdim(routine_data)\n" - ], - "execution_count": null, - "outputs": [], - "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac" + ] }, { "cell_type": "markdown", + "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b", "metadata": {}, "source": [ "### 1. Résumé des valeurs aberrantes détectées dans les données de routine" - ], - "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "outlier_flags <- routine_data %>%\n", " # Keep only OUTLIER_* columns that are logical flags.\n", @@ -175,26 +174,26 @@ "}\n", "\n", "outlier_summary_long" - ], - "execution_count": null, - "outputs": [], - "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2" + ] }, { "cell_type": "markdown", + "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d", "metadata": {}, "source": [ "### 2. Visualisation des valeurs aberrantes (méthode IQR)" - ], - "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "abeae17b-935d-49d9-a239-89985e469d81", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "#--- PARAMETERS ---\n", "outlier_cols <- if (\"OUTLIER_DETECTED\" %in% names(routine_data)) {\n", @@ -203,29 +202,27 @@ " routine_data %>% select(starts_with(\"OUTLIER_\")) %>% names()\n", "}\n", "print(outlier_cols)" - ], - "execution_count": null, - "outputs": [], - "id": "abeae17b-935d-49d9-a239-89985e469d81" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "99dd199e-bcf1-4900-b96e-f0f3285caec2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" - ], - "execution_count": null, - "outputs": [], - "id": "99dd199e-bcf1-4900-b96e-f0f3285caec2" + ] }, { "cell_type": "markdown", + "id": "764e6f6a-f810-4077-8ed4-6d5b24c4caf4", "metadata": {}, "source": [ "### Include plots \n", @@ -233,16 +230,18 @@ "-Clean folder \n", "-Save Images \n", "-Load the images " - ], - "id": "764e6f6a-f810-4077-8ed4-6d5b24c4caf4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "8e4834fe-16d3-40ca-91a3-a38cd8e301d0", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Create folder if it doesn't exist\n", "output_dir <- file.path(getwd(), \"outputs/plots\")\n", @@ -252,18 +251,18 @@ " files <- list.files(output_dir, full.names = TRUE)\n", " if (length(files) > 0) file.remove(files)\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "8e4834fe-16d3-40ca-91a3-a38cd8e301d0" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "df34b8d6-489f-42d7-a9b3-3862edc6b780", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "selected_inds <- c(\"SUSP\", \"TEST\", \"CONF\")\n", "\n", @@ -285,21 +284,19 @@ "for (img in img_files) {\n", " IRdisplay::display_png(file = img)\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "df34b8d6-489f-42d7-a9b3-3862edc6b780" + ] }, { "cell_type": "markdown", + "id": "1ad3de61-1ff3-430b-83d8-f1c9fc924b43", "metadata": {}, "source": [ "### 3. Cohérence des indicateurs au niveau nationale" - ], - "id": "1ad3de61-1ff3-430b-83d8-f1c9fc924b43" + ] }, { "cell_type": "markdown", + "id": "bd92f817-83a5-4597-b547-c9f0a506c08a", "metadata": {}, "source": [ "La section ci-dessous est un extrait des explications fournies par la **Community code library for SNT**. Veuillez consulter le site Web pour obtenir des explications complètes: https://ahadi-analytics.github.io/snt-code-library/english/library/data/routine_cases/quality_control.html#cb19-55\n", @@ -319,16 +316,18 @@ "**Décès toutes causes confondues ≥ décès dus au paludisme**: un rapport de 1:1 implique que tous les décès déclarés sont attribués au paludisme. Lorsque les décès toutes causes confondues sont inférieurs aux décès dus au paludisme, cela reflète des problèmes de qualité des données liés à une déclaration inadéquate de l'une ou des deux variables.\n", "\n", "**Admissions pour paludisme ≥ décès dus au paludisme**: un rapport de 1:1 implique que tous les cas de paludisme hospitalisés sont décédés, soit un taux de mortalité hospitalière de 100 %. Lorsque les admissions pour paludisme sont inférieures aux décès dus au paludisme, cela reflète des problèmes de qualité des données liés à une déclaration inadéquate de l'une ou des deux variables." - ], - "id": "bd92f817-83a5-4597-b547-c9f0a506c08a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4ff0b337-2370-44ca-ac66-24068e7680c0", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 1: Extract year, month from PERIOD & aggregate\n", "routine_month <- routine_data_imputed %>%\n", @@ -345,18 +344,18 @@ " PRES = sum(PRES, na.rm = TRUE),\n", " .groups = \"drop\"\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "4ff0b337-2370-44ca-ac66-24068e7680c0" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e183891f-c823-4fa2-9cbf-74143a8526fc", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 2: Plot monthly national trends\n", "options(repr.plot.width = 14, repr.plot.height = 6)\n", @@ -376,18 +375,18 @@ " legend.title = element_text(size = 16),\n", " legend.text = element_text(size = 16)\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "e183891f-c823-4fa2-9cbf-74143a8526fc" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "408b0396-346b-45a5-9358-6375400a6767", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Identify indicator columns automatically (all numeric except YEAR, MONTH, IDs)\n", "indicator_cols <- routine_data_imputed %>%\n", @@ -401,18 +400,18 @@ " ungroup()\n", "\n", "yearly_totals %>% select(YEAR, SUSP, TEST, CONF)" - ], - "execution_count": null, - "outputs": [], - "id": "408b0396-346b-45a5-9358-6375400a6767" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "54c415c8-8431-413c-9373-4d957776eff7", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 3: Create scatter plots\n", "routine_hd_month <- routine_data_imputed %>%\n", @@ -453,43 +452,43 @@ "\n", "# Step 3: Combine plots\n", "(p1 | p2 | p3) + plot_layout(guides = \"collect\")" - ], - "execution_count": null, - "outputs": [], - "id": "54c415c8-8431-413c-9373-4d957776eff7" + ] }, { "cell_type": "markdown", + "id": "61c5a165-af0e-45a4-9d2e-8901451a4d6b", "metadata": {}, "source": [ "Le graphique en bas montre le **pourcentage de rapports mensuels des formations sanitaires au niveau national** qui ont passé chaque contrôle de cohérence pour chaque année. Chaque cellule indique la proportion de rapports mensuels d’une année donnée qui respectent la règle de cohérence correspondante. Évaluer ces contrôles d’une année à l’autre et entre catégories permet d’identifier les **tendances générales de la qualité des données**." - ], - "id": "61c5a165-af0e-45a4-9d2e-8901451a4d6b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4e530e1c-c6be-4de0-bc23-57ce1135ed59", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", "defs <- get_coherence_definitions()\n", "checks <- defs$checks\n", "check_labels <- defs$check_labels" - ], - "execution_count": null, - "outputs": [], - "id": "4e530e1c-c6be-4de0-bc23-57ce1135ed59" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "74f70f5c-a56d-4c47-bc5a-3cc828759d54", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# National coherence summary and plot via report utils\n", "coherency_metrics <- compute_national_coherency_metrics(\n", @@ -500,26 +499,26 @@ "\n", "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" - ], - "execution_count": null, - "outputs": [], - "id": "74f70f5c-a56d-4c47-bc5a-3cc828759d54" + ] }, { "cell_type": "markdown", + "id": "da37c36d-bf7e-4ed7-8cfc-b377329b8d89", "metadata": {}, "source": [ "### 4. Visualisation de la cohérence au niveau du AMD1" - ], - "id": "da37c36d-bf7e-4ed7-8cfc-b377329b8d89" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "ca2d499c-4f17-4ec0-a090-3ca75daa914e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# ADM coherence summaries via report utils\n", "adm_coherence_data <- compute_adm_coherence_long(\n", @@ -533,58 +532,58 @@ "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" - ], - "execution_count": null, - "outputs": [], - "id": "ca2d499c-4f17-4ec0-a090-3ca75daa914e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c159c47c-a2b1-411a-98fb-50d5173ebd0b", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ], - "execution_count": null, - "outputs": [], - "id": "c159c47c-a2b1-411a-98fb-50d5173ebd0b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f11eb536-b633-40fb-b182-f67ea51e0c66", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Plot per year\n", "years_available <- sort(unique(adm_long$YEAR))\n", "for (year in years_available) {\n", " plot_coherence_heatmap(df = adm_long, selected_year = year, agg_level = \"ADM1_NAME\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "f11eb536-b633-40fb-b182-f67ea51e0c66" + ] }, { "cell_type": "markdown", + "id": "76315fa7-2fa2-4ea2-baf2-bc427b3f659a", "metadata": {}, "source": [ "### 5. Visualisation de la cohérence au niveau du AMD2" - ], - "id": "76315fa7-2fa2-4ea2-baf2-bc427b3f659a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "8930acc3-aafa-4ecb-9a1d-a48fc2faf1cd", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "shapes_data <- shapes_data %>%\n", " mutate(ADM2_ID = as.character(ADM2_ID))\n", @@ -600,32 +599,32 @@ " ADM2_NAME_shape = ADM2_NAME.x,\n", " ADM2_NAME_data = ADM2_NAME.y\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "8930acc3-aafa-4ecb-9a1d-a48fc2faf1cd" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "002ca971-fdff-4e12-ad0f-5daae51ba126", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ], - "execution_count": null, - "outputs": [], - "id": "002ca971-fdff-4e12-ad0f-5daae51ba126" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "88982e8e-43f4-4b6d-9f04-a61a03217c8d", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Loop over all available columns\n", "for (check_col in names(check_labels)) {\n", @@ -641,22 +640,19 @@ " # width = 14, height = 10, dpi = 300)\n", " }\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "88982e8e-43f4-4b6d-9f04-a61a03217c8d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5d0b89bb-53c8-4d0d-aa39-71341956f1d0", "metadata": { "vscode": { "languageId": "r" } }, - "source": [], - "execution_count": null, "outputs": [], - "id": "5d0b89bb-53c8-4d0d-aa39-71341956f1d0" + "source": [] } ], "metadata": { @@ -676,4 +672,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r index 35204c2..64b49e0 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r @@ -10,6 +10,8 @@ bootstrap_iqr_context <- function( code_path <- file.path(root_path, "code") config_path <- file.path(root_path, "configuration") data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) source(file.path(code_path, "snt_utils.r")) install_and_load(required_packages) @@ -39,6 +41,7 @@ bootstrap_iqr_context <- function( CODE_PATH = code_path, CONFIG_PATH = config_path, DATA_PATH = data_path, + OUTPUT_DIR = output_dir, openhexa = openhexa, config_json = config_json )) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index ce4fdc2..65c10ec 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -26,8 +26,6 @@ "# Parameters with safe defaults\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "if (!exists(\"CONFIG_FILE_NAME\")) CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "# Partial step is always required by MG logic; complete step is optional.\n", - "RUN_MAGIC_GLASSES_PARTIAL <- TRUE\n", "if (!exists(\"RUN_MAGIC_GLASSES_COMPLETE\")) RUN_MAGIC_GLASSES_COMPLETE <- FALSE\n", "if (!exists(\"DEVIATION_MAD15\")) DEVIATION_MAD15 <- 15\n", "if (!exists(\"DEVIATION_MAD10\")) DEVIATION_MAD10 <- 10\n", @@ -43,13 +41,7 @@ "\n", "if (DEV_SUBSET_ADM1_N < 1) {\n", " stop(\"DEV_SUBSET_ADM1_N must be >= 1\")\n", - "}\n", - "\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", - "OUTPUT_DIR <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", - "dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)" + "}" ] }, { @@ -76,17 +68,12 @@ ")\n", "\n", "setup_ctx <- mg_input$setup_ctx\n", - "config_json <- mg_input$config_json\n", "COUNTRY_CODE <- mg_input$country_code\n", "fixed_cols <- mg_input$fixed_cols\n", "indicators_to_keep <- mg_input$indicators_to_keep\n", "dhis2_routine <- mg_input$dhis2_routine\n", "dhis2_routine_long <- mg_input$dhis2_routine_long\n", - "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 2340133..cdeaf1a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -36,12 +36,8 @@ " required_packages = c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\")\n", ")\n", "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "DATA_PATH <- file.path(setup_ctx$DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "config_json <- fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "DATA_PATH <- setup_ctx$OUTPUT_DIR\n", + "config_json <- fromJSON(file.path(setup_ctx$CONFIG_PATH, \"SNT_config.json\"))\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", "\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index 8cd4bb4..186a8fe 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -7,6 +7,8 @@ bootstrap_magic_glasses_context <- function( code_path <- file.path(root_path, "code") config_path <- file.path(root_path, "configuration") data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) source(file.path(code_path, "snt_utils.r")) install_and_load(unique(required_packages)) @@ -26,6 +28,7 @@ bootstrap_magic_glasses_context <- function( CODE_PATH = code_path, CONFIG_PATH = config_path, DATA_PATH = data_path, + OUTPUT_DIR = output_dir, openhexa = openhexa )) } diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 2a4ac29..952f86d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -68,10 +68,7 @@ " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", ")\n", "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -117,7 +114,7 @@ "source": [ "# Load SNT config from bootstrap context\n", "config_json <- setup_ctx$config_json\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -683,7 +680,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# Mean detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index c650d21..020f9a7 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -42,11 +42,7 @@ "setup_ctx <- bootstrap_outliers_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", - ")\n", - "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa" + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r index 474f342..2a0acf2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -10,6 +10,8 @@ bootstrap_outliers_context <- function( code_path <- file.path(root_path, "code") config_path <- file.path(root_path, "configuration") data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) source(file.path(code_path, "snt_utils.r")) install_and_load(required_packages) @@ -39,6 +41,7 @@ bootstrap_outliers_context <- function( CODE_PATH = code_path, CONFIG_PATH = config_path, DATA_PATH = data_path, + OUTPUT_DIR = output_dir, openhexa = openhexa, config_json = config_json )) diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index b6cd2c7..e75f40e 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -68,10 +68,7 @@ " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", ")\n", "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa" + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -117,7 +114,7 @@ "source": [ "# Load SNT config from bootstrap context\n", "config_json <- setup_ctx$config_json\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { @@ -682,7 +679,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# Median detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 685f278..18bf856 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -42,11 +42,7 @@ "setup_ctx <- bootstrap_outliers_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", - ")\n", - "\n", - "CODE_PATH <- setup_ctx$CODE_PATH\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa" + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r index 3b46c55..d0db1e3 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -10,6 +10,8 @@ bootstrap_outliers_context <- function( code_path <- file.path(root_path, "code") config_path <- file.path(root_path, "configuration") data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) source(file.path(code_path, "snt_utils.r")) install_and_load(required_packages) @@ -39,6 +41,7 @@ bootstrap_outliers_context <- function( CODE_PATH = code_path, CONFIG_PATH = config_path, DATA_PATH = data_path, + OUTPUT_DIR = output_dir, openhexa = openhexa, config_json = config_json )) diff --git a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb index 3fb9d5e..8827371 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb @@ -66,27 +66,18 @@ "outputs": [], "source": [ "# Project folders (ROOT_PATH injected by pipeline if available)\n", - "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\" \n", + "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_path\")\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared helpers for this pipeline (code)\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_path.r\"))\n", + "setup_ctx <- bootstrap_path_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")" + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, { @@ -708,7 +699,7 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH , \"dhis2\", \"outliers_imputation\")\n", + "output_path <- OUTPUT_DIR\n", "\n", "# Save routine outliers table (parquet)\n", "outliers_parquet <- file.path(output_path , paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")) \n", diff --git a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r index 3348940..40cf610 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r +++ b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r @@ -1,5 +1,39 @@ # Helpers for PATH outliers imputation notebook. +bootstrap_path_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", "reticulate", "glue"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + output_dir <- file.path(data_path, "dhis2", "outliers_imputation") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + OUTPUT_DIR = output_dir, + openhexa = openhexa + )) +} + build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) { dhis2_routine %>% dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME", "OU_ID", "OU_NAME", "PERIOD", DHIS2_INDICATORS))) %>% diff --git a/snt_dhis2_outliers_imputation_iqr/pipeline.py b/snt_dhis2_outliers_imputation_iqr/pipeline.py index c6bda5b..928acba 100644 --- a/snt_dhis2_outliers_imputation_iqr/pipeline.py +++ b/snt_dhis2_outliers_imputation_iqr/pipeline.py @@ -81,7 +81,6 @@ def snt_dhis2_outliers_imputation_iqr( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "IQR", "DEVIATION_IQR": deviation_iqr, } run_start_ts = time.time() diff --git a/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py b/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py index 29a678e..8844d9a 100644 --- a/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py +++ b/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py @@ -57,12 +57,9 @@ def snt_dhis2_outliers_imputation_magic_glasses( mode_clean = (mode or "partial").strip().lower() if mode_clean not in ("partial", "complete"): raise ValueError('mode must be "partial" or "complete".') - run_mg_partial = True run_mg_complete = mode_clean == "complete" current_run.log_info(f"Selected detection mode: {mode_clean}") - current_run.log_info( - f"Flags => RUN_MAGIC_GLASSES_PARTIAL={run_mg_partial}, RUN_MAGIC_GLASSES_COMPLETE={run_mg_complete}" - ) + current_run.log_info(f"RUN_MAGIC_GLASSES_COMPLETE={run_mg_complete}") if run_mg_complete: current_run.log_warning( "Complete mode selected: seasonal detection is very slow and can take several hours to run." @@ -101,8 +98,6 @@ def snt_dhis2_outliers_imputation_magic_glasses( input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "MG_COMPLETE" if run_mg_complete else "MG_PARTIAL", - "RUN_MAGIC_GLASSES_PARTIAL": run_mg_partial, "RUN_MAGIC_GLASSES_COMPLETE": run_mg_complete, "DEVIATION_MAD15": 15, "DEVIATION_MAD10": 10, diff --git a/snt_dhis2_outliers_imputation_mean/pipeline.py b/snt_dhis2_outliers_imputation_mean/pipeline.py index be6cc22..013183b 100644 --- a/snt_dhis2_outliers_imputation_mean/pipeline.py +++ b/snt_dhis2_outliers_imputation_mean/pipeline.py @@ -81,7 +81,6 @@ def snt_dhis2_outliers_imputation_mean( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "MEAN", "DEVIATION_MEAN": deviation_mean, } run_start_ts = time.time() diff --git a/snt_dhis2_outliers_imputation_median/pipeline.py b/snt_dhis2_outliers_imputation_median/pipeline.py index 0a94ce5..220eb0e 100644 --- a/snt_dhis2_outliers_imputation_median/pipeline.py +++ b/snt_dhis2_outliers_imputation_median/pipeline.py @@ -81,7 +81,6 @@ def snt_dhis2_outliers_imputation_median( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "MEDIAN", "DEVIATION_MEDIAN": deviation_median, } run_start_ts = time.time() diff --git a/snt_dhis2_outliers_imputation_path/pipeline.py b/snt_dhis2_outliers_imputation_path/pipeline.py index d3a55a0..6511d24 100644 --- a/snt_dhis2_outliers_imputation_path/pipeline.py +++ b/snt_dhis2_outliers_imputation_path/pipeline.py @@ -84,7 +84,6 @@ def snt_dhis2_outliers_imputation_path( if not run_report_only: input_params = { "ROOT_PATH": Path(workspace.files_path).as_posix(), - "OUTLIERS_METHOD": "PATH", "DEVIATION_MEAN": deviation_mean, } run_start_ts = time.time() From 0d73d607b22fd4016006744129eec4ebfcdd2142 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 2 Apr 2026 12:19:13 +0200 Subject: [PATCH 16/17] Refactor outlier imputation notebooks and utility functions to centralize routine data loading and validation. Introduced `load_routine_data` function to streamline data retrieval and error handling, enhancing modularity and clarity across multiple pipelines. --- .../snt_dhis2_outliers_imputation_iqr.ipynb | 36 +++---------- .../utils/snt_dhis2_outliers_imputation_iqr.r | 31 ++++++++++++ ..._dhis2_outliers_imputation_magic_glasses.r | 50 +++++++++++++------ .../snt_dhis2_outliers_imputation_mean.ipynb | 37 +++----------- .../snt_dhis2_outliers_imputation_mean.r | 31 ++++++++++++ ...snt_dhis2_outliers_imputation_median.ipynb | 36 +++---------- .../snt_dhis2_outliers_imputation_median.r | 31 ++++++++++++ .../snt_dhis2_outliers_imputation_path.ipynb | 33 +++--------- .../snt_dhis2_outliers_imputation_path.r | 31 ++++++++++++ 9 files changed, 190 insertions(+), 126 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index ae8d33a..e91dd17 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -128,16 +128,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", @@ -176,15 +167,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" ] @@ -200,8 +188,7 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR/MONTH casting is handled inside load_routine_data()." ] }, { @@ -223,14 +210,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) {\n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r index 64b49e0..c1b471c 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r @@ -47,6 +47,37 @@ bootstrap_iqr_context <- function( )) } +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + # Compute moving-average imputations for a selected outlier flag column. impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index 186a8fe..00e323b 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -33,6 +33,37 @@ bootstrap_magic_glasses_context <- function( )) } +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + detect_outliers_mad_custom <- function(dt, deviation) { flag_col <- paste0("OUTLIER_MAD", deviation) dt <- data.table::copy(dt) @@ -177,22 +208,11 @@ prepare_magic_glasses_input <- function( indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED - dhis2_routine <- tryCatch( - { - get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } + dhis2_routine <- load_routine_data( + dataset_name = dataset_name, + country_code = country_code, + required_indicators = indicators_to_keep ) - log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) - log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) - - if (all(c("YEAR", "MONTH") %in% names(dhis2_routine))) { - dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) - } cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine)) dt_routine <- data.table::as.data.table(dhis2_routine)[, ..cols_to_select] diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 952f86d..4ff4d3d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -128,16 +128,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", @@ -176,16 +167,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" ] @@ -201,8 +188,7 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR/MONTH casting is handled inside load_routine_data()." ] }, { @@ -224,14 +210,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) {\n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r index 2a0acf2..370f1fd 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -47,6 +47,37 @@ bootstrap_outliers_context <- function( )) } +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index e75f40e..9b6a64a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -128,16 +128,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", @@ -176,15 +167,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" ] @@ -200,8 +188,7 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR/MONTH casting is handled inside load_routine_data()." ] }, { @@ -223,14 +210,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) {\n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r index d0db1e3..982e9da 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -47,6 +47,37 @@ bootstrap_outliers_context <- function( )) } +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) diff --git a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb index 8827371..03f2461 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb @@ -143,16 +143,7 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "for (conf in c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\")) {\n", - " print(glue(\"{conf} : {config_json$SNT_CONFIG[conf]}\"))\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", + "# Configuration validation is handled in pipeline.py\n", "# Set config vars\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", @@ -192,15 +183,12 @@ "source": [ "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_routine_data(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 4)" ] @@ -224,14 +212,7 @@ }, "outputs": [], "source": [ - "# Raise an error if any of DHIS2_INDICATORS are not present in the dhis2 routine data.\n", - "for (ind in DHIS2_INDICATORS) { \n", - " if (!(ind %in% colnames(dhis2_routine))) {\n", - " msg <- paste(\"[ERROR] Missing indicator column in routine data: \", ind)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}" + "# Indicator validation is handled inside load_routine_data()." ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r index 40cf610..f7e1817 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r +++ b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r @@ -34,6 +34,37 @@ bootstrap_path_context <- function( )) } +load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (cast_year_month && all(c("YEAR", "MONTH") %in% colnames(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + if (!is.null(required_indicators)) { + missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) + if (length(missing_indicators) > 0) { + msg <- paste("[ERROR] Missing indicator column(s) in routine data:", paste(missing_indicators, collapse = ", ")) + log_msg(msg) + stop(msg) + } + } + + dhis2_routine +} + build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) { dhis2_routine %>% dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME", "OU_ID", "OU_NAME", "PERIOD", DHIS2_INDICATORS))) %>% From d122d5c2183b3b8d3efe3360cd4df28ba4f79c18 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 2 Apr 2026 16:29:27 +0200 Subject: [PATCH 17/17] final fix, everything is working --- .../utils/snt_dhis2_outliers_imputation_iqr.r | 51 ++++++++- ..._dhis2_outliers_imputation_magic_glasses.r | 101 ++++++++++++++++-- .../snt_dhis2_outliers_imputation_mean.r | 49 +++++++++ .../snt_dhis2_outliers_imputation_median.r | 49 +++++++++ .../snt_dhis2_outliers_imputation_path.ipynb | 13 +-- .../snt_dhis2_outliers_imputation_path.r | 86 ++++++++++++++- 6 files changed, 327 insertions(+), 22 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r index c1b471c..e480314 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r @@ -1,4 +1,18 @@ # Shared bootstrap for the IQR outliers pipeline notebooks. +# +# Function docs use lightweight R comments to keep notebooks readable +# while documenting expected inputs/outputs for analysts and maintainers. + +#' Initialize runtime context for the IQR outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, loads SNT configuration, and returns a +#' single context object used by notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. bootstrap_iqr_context <- function( root_path = "~/workspace", required_packages = c( @@ -24,6 +38,8 @@ bootstrap_iqr_context <- function( if (load_openhexa) { openhexa <- reticulate::import("openhexa.sdk") } + # snt_utils::log_msg() expects a global `openhexa` object. + assign("openhexa", openhexa, envir = .GlobalEnv) config_json <- tryCatch( { @@ -47,6 +63,17 @@ bootstrap_iqr_context <- function( )) } +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. +#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { dhis2_routine <- tryCatch( { @@ -78,7 +105,15 @@ load_routine_data <- function(dataset_name, country_code, required_indicators = dhis2_routine } -# Compute moving-average imputations for a selected outlier flag column. +#' Impute flagged outliers using a centered moving average. +#' +#' For each ADM/OU/indicator time series, values marked as outliers are +#' replaced by a 3-point centered moving average (ceiling), preserving +#' non-outlier observations. +#' +#' @param dt Routine data in long format. +#' @param outlier_col Name of the logical outlier flag column. +#' @return Data frame with VALUE_IMPUTED column and helper columns removed. impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) @@ -97,7 +132,19 @@ impute_outliers_dt <- function(dt, outlier_col) { return(as.data.frame(data.table::copy(dt))) } -# Format imputed/removed routine output tables. +#' Build final routine output tables (imputed or removed). +#' +#' Reshapes long-format routine values back to wide indicator columns, joins +#' location names, and standardizes output columns expected by downstream +#' datasets and reporting. +#' +#' @param df Long-format routine data including VALUE_IMPUTED. +#' @param outlier_column Outlier flag column used to filter removed records. +#' @param DHIS2_INDICATORS Indicator columns to keep in the final table. +#' @param fixed_cols Fixed identifier/date columns in long format. +#' @param pyramid_names Mapping table with ADM/OU names. +#' @param remove Logical; when TRUE returns outlier-removed data. +#' @return Wide routine data frame ready for export. format_routine_data_selection <- function( df, outlier_column, diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index 00e323b..4aa5e57 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -1,4 +1,15 @@ # Main helpers for magic glasses outliers imputation pipeline. + +#' Initialize runtime context for the Magic Glasses pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, and returns a context object consumed by +#' downstream setup and processing functions. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths and OpenHEXA handle. bootstrap_magic_glasses_context <- function( root_path = "~/workspace", required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), @@ -33,6 +44,17 @@ bootstrap_magic_glasses_context <- function( )) } +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. +#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { dhis2_routine <- tryCatch( { @@ -64,6 +86,14 @@ load_routine_data <- function(dataset_name, country_code, required_indicators = dhis2_routine } +#' Detect point outliers using MAD thresholding. +#' +#' Computes median and MAD by YEAR, OU_ID and INDICATOR, then flags observations +#' outside median +/- deviation * MAD. +#' +#' @param dt Long-format routine data table. +#' @param deviation Numeric MAD multiplier. +#' @return Data table with method-specific outlier flag column. detect_outliers_mad_custom <- function(dt, deviation) { flag_col <- paste0("OUTLIER_MAD", deviation) dt <- data.table::copy(dt) @@ -75,6 +105,16 @@ detect_outliers_mad_custom <- function(dt, deviation) { dt } +#' Detect seasonal outliers using cleaned time series residuals. +#' +#' Applies `forecast::tsclean` per OU/indicator series and flags observations +#' whose distance from cleaned signal exceeds a MAD-scaled deviation threshold. +#' Supports optional parallel execution when workers > 1. +#' +#' @param dt Long-format routine data table. +#' @param deviation Numeric threshold on scaled residuals. +#' @param workers Number of parallel workers for group processing. +#' @return Data table with seasonal outlier flag column. detect_seasonal_outliers <- function(dt, deviation, workers = 1) { outlier_col <- paste0("OUTLIER_SEASONAL", deviation) dt <- data.table::copy(dt) @@ -138,6 +178,16 @@ detect_seasonal_outliers <- function(dt, deviation, workers = 1) { result_dt } +#' Convert long routine data back to wide export format. +#' +#' Casts indicator rows to columns, joins administrative names, and guarantees +#' expected export columns exist with appropriate default types. +#' +#' @param dt_long Long-format routine data. +#' @param fixed_cols Fixed identifier/date columns. +#' @param indicators_to_keep Indicator columns expected in output. +#' @param pyramid_names Mapping table with ADM/OU names. +#' @return Wide routine data table ready for parquet export. to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { routine_wide <- data.table::dcast( dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], @@ -162,6 +212,19 @@ to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_nam routine_wide } +#' Prepare validated inputs for Magic Glasses detection. +#' +#' Bootstraps runtime context, loads configuration and routine input data, +#' validates required indicators, reshapes data to long format, deduplicates +#' keys, and optionally subsets data for development runs. +#' +#' @param root_path Project root folder (workspace). +#' @param config_file_name Configuration filename under configuration folder. +#' @param run_complete Logical; enable seasonal complete mode. +#' @param seasonal_workers Number of workers for seasonal detection. +#' @param dev_subset Logical; keep only a subset of ADM1 for development. +#' @param dev_subset_adm1_n Number of ADM1 values to keep in dev mode. +#' @return List with setup context, config variables and prepared data tables. prepare_magic_glasses_input <- function( root_path, config_file_name = "SNT_config.json", @@ -194,15 +257,6 @@ prepare_magic_glasses_input <- function( config_json <- jsonlite::fromJSON(file.path(setup_ctx$CONFIG_PATH, config_file_name)) - snt_config_mandatory <- c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2") - for (conf in snt_config_mandatory) { - if (is.null(config_json$SNT_CONFIG[[conf]])) { - msg <- paste("Missing configuration input:", conf) - log_msg(msg) - stop(msg) - } - } - country_code <- config_json$SNT_CONFIG$COUNTRY_CODE fixed_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID") indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) @@ -264,6 +318,20 @@ prepare_magic_glasses_input <- function( ) } +#' Run Magic Glasses outlier detection workflow. +#' +#' Executes MAD15 then MAD10 detection, and optionally seasonal5 then seasonal3 +#' detection for complete mode, returning intermediate/final flag tables used by +#' export and reporting. +#' +#' @param dhis2_routine_long Long-format routine data. +#' @param deviation_mad15 MAD threshold for first pass. +#' @param deviation_mad10 MAD threshold for second pass. +#' @param run_complete Logical; run seasonal stages when TRUE. +#' @param deviation_seasonal5 Seasonal threshold for first seasonal pass. +#' @param deviation_seasonal3 Seasonal threshold for second seasonal pass. +#' @param seasonal_workers Number of workers for seasonal detection. +#' @return List with partial and complete outlier-flag tables. run_magic_glasses_outlier_detection <- function( dhis2_routine_long, deviation_mad15 = 15, @@ -344,6 +412,21 @@ run_magic_glasses_outlier_detection <- function( ) } +#' Export Magic Glasses outputs for datasets and downstream use. +#' +#' Builds unified detection table, writes imputed and removed routine outputs, +#' and chooses partial or complete outlier flag depending on execution mode. +#' +#' @param dhis2_routine_long Long-format routine data used as base. +#' @param flagged_outliers_mad15_mad10 Partial detection output. +#' @param flagged_outliers_seasonal5_seasonal3 Complete detection output. +#' @param run_complete Logical; export complete method flags when available. +#' @param dhis2_routine Original wide routine table for name mapping. +#' @param fixed_cols Fixed identifier/date columns. +#' @param indicators_to_keep Indicator columns expected in outputs. +#' @param output_dir Output folder path. +#' @param country_code Country code used in output filenames. +#' @return Invisible list with selected active outlier column metadata. export_magic_glasses_outputs <- function( dhis2_routine_long, flagged_outliers_mad15_mad10, diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r index 370f1fd..75d59ed 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -1,4 +1,18 @@ # Main helpers for mean outliers imputation pipeline. +# +# Function docs use lightweight R comments to keep notebooks readable +# while documenting expected inputs/outputs for analysts and maintainers. + +#' Initialize runtime context for the mean outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, loads SNT configuration, and returns a +#' single context object used by notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. bootstrap_outliers_context <- function( root_path = "~/workspace", required_packages = c( @@ -24,6 +38,8 @@ bootstrap_outliers_context <- function( if (load_openhexa) { openhexa <- reticulate::import("openhexa.sdk") } + # snt_utils::log_msg() expects a global `openhexa` object. + assign("openhexa", openhexa, envir = .GlobalEnv) config_json <- tryCatch( { @@ -47,6 +63,17 @@ bootstrap_outliers_context <- function( )) } +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. +#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { dhis2_routine <- tryCatch( { @@ -78,6 +105,15 @@ load_routine_data <- function(dataset_name, country_code, required_indicators = dhis2_routine } +#' Impute flagged outliers using a centered moving average. +#' +#' For each ADM/OU/indicator time series, values marked as outliers are +#' replaced by a 3-point centered moving average (ceiling), preserving +#' non-outlier observations. +#' +#' @param dt Routine data in long format. +#' @param outlier_col Name of the logical outlier flag column. +#' @return Data frame with VALUE_IMPUTED column and helper columns removed. impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) @@ -93,6 +129,19 @@ impute_outliers_dt <- function(dt, outlier_col) { return(as.data.frame(data.table::copy(dt))) } +#' Build final routine output tables (imputed or removed). +#' +#' Reshapes long-format routine values back to wide indicator columns, joins +#' location names, and standardizes output columns expected by downstream +#' datasets and reporting. +#' +#' @param df Long-format routine data including VALUE_IMPUTED. +#' @param outlier_column Outlier flag column used to filter removed records. +#' @param DHIS2_INDICATORS Indicator columns to keep in the final table. +#' @param fixed_cols Fixed identifier/date columns in long format. +#' @param pyramid_names Mapping table with ADM/OU names. +#' @param remove Logical; when TRUE returns outlier-removed data. +#' @return Wide routine data frame ready for export. format_routine_data_selection <- function( df, outlier_column, diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r index 982e9da..272ee05 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -1,4 +1,18 @@ # Main helpers for median outliers imputation pipeline. +# +# Function docs use lightweight R comments to keep notebooks readable +# while documenting expected inputs/outputs for analysts and maintainers. + +#' Initialize runtime context for the median outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, loads SNT configuration, and returns a +#' single context object used by notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. bootstrap_outliers_context <- function( root_path = "~/workspace", required_packages = c( @@ -24,6 +38,8 @@ bootstrap_outliers_context <- function( if (load_openhexa) { openhexa <- reticulate::import("openhexa.sdk") } + # snt_utils::log_msg() expects a global `openhexa` object. + assign("openhexa", openhexa, envir = .GlobalEnv) config_json <- tryCatch( { @@ -47,6 +63,17 @@ bootstrap_outliers_context <- function( )) } +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. +#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { dhis2_routine <- tryCatch( { @@ -78,6 +105,15 @@ load_routine_data <- function(dataset_name, country_code, required_indicators = dhis2_routine } +#' Impute flagged outliers using a centered rolling median. +#' +#' For each ADM/OU/indicator time series, values marked as outliers are +#' replaced by a 3-point centered rolling median (ceiling), preserving +#' non-outlier observations. +#' +#' @param dt Routine data in long format. +#' @param outlier_col Name of the logical outlier flag column. +#' @return Data frame with VALUE_IMPUTED column and helper columns removed. impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) @@ -93,6 +129,19 @@ impute_outliers_dt <- function(dt, outlier_col) { return(as.data.frame(data.table::copy(dt))) } +#' Build final routine output tables (imputed or removed). +#' +#' Reshapes long-format routine values back to wide indicator columns, joins +#' location names, and standardizes output columns expected by downstream +#' datasets and reporting. +#' +#' @param df Long-format routine data including VALUE_IMPUTED. +#' @param outlier_column Outlier flag column used to filter removed records. +#' @param DHIS2_INDICATORS Indicator columns to keep in the final table. +#' @param fixed_cols Fixed identifier/date columns in long format. +#' @param pyramid_names Mapping table with ADM/OU names. +#' @param remove Logical; when TRUE returns outlier-removed data. +#' @return Wide routine data frame ready for export. format_routine_data_selection <- function( df, outlier_column, diff --git a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb index 03f2461..0c57694 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb @@ -76,7 +76,6 @@ " required_packages = c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", ")\n", "\n", - "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "OUTPUT_DIR <- setup_ctx$OUTPUT_DIR" ] }, @@ -121,15 +120,9 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", + "log_msg(glue(\"SNT configuration loaded from : {file.path(setup_ctx$CONFIG_PATH, 'SNT_config.json')}\"))" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r index f7e1817..288c12c 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r +++ b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r @@ -1,5 +1,15 @@ # Helpers for PATH outliers imputation notebook. +#' Initialize runtime context for the PATH outliers pipeline. +#' +#' Creates standard project paths, loads shared dependencies and utilities, +#' initializes OpenHEXA SDK access, and returns a context object used by +#' notebooks. +#' +#' @param root_path Project root folder (workspace). +#' @param required_packages Character vector of R packages to install/load. +#' @param load_openhexa Logical; import OpenHEXA SDK when TRUE. +#' @return Named list with paths, OpenHEXA handle, and parsed config. bootstrap_path_context <- function( root_path = "~/workspace", required_packages = c("arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", "reticulate", "glue"), @@ -24,16 +34,39 @@ bootstrap_path_context <- function( } assign("openhexa", openhexa, envir = .GlobalEnv) + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + return(list( ROOT_PATH = root_path, CODE_PATH = code_path, CONFIG_PATH = config_path, DATA_PATH = data_path, OUTPUT_DIR = output_dir, - openhexa = openhexa + openhexa = openhexa, + config_json = config_json )) } +#' Load DHIS2 routine input data with validation and logging. +#' +#' Reads the latest routine parquet file from OpenHEXA, logs dataset details, +#' optionally casts YEAR and MONTH to integers, and validates indicator columns. +#' Stops execution with a clear error when required fields are missing. +#' +#' @param dataset_name OpenHEXA dataset identifier/name. +#' @param country_code Country code used in routine filename prefix. +#' @param required_indicators Optional character vector of required indicators. +#' @param cast_year_month Logical; cast YEAR/MONTH columns to integer. +#' @return Data frame containing validated routine data. load_routine_data <- function(dataset_name, country_code, required_indicators = NULL, cast_year_month = TRUE) { dhis2_routine <- tryCatch( { @@ -65,6 +98,15 @@ load_routine_data <- function(dataset_name, country_code, required_indicators = dhis2_routine } +#' Build PATH-ready long routine table. +#' +#' Selects required administrative/time columns, pivots indicator values to +#' long format, and completes missing PERIOD/INDICATOR combinations for each +#' location. +#' +#' @param dhis2_routine Routine data in wide format. +#' @param DHIS2_INDICATORS Indicator column names to pivot. +#' @return Long-format routine data frame used by PATH detection. build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) { dhis2_routine %>% dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME", "OU_ID", "OU_NAME", "PERIOD", DHIS2_INDICATORS))) %>% @@ -73,6 +115,13 @@ build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) { dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "INDICATOR", "VALUE"))) } +#' Remove duplicate observations in PATH long routine data. +#' +#' Detects duplicate keys at ADM/OU/PERIOD/INDICATOR level, logs duplicate +#' counts, and keeps distinct rows only when duplicates exist. +#' +#' @param dhis2_routine_long Long-format routine data frame. +#' @return List with cleaned `data` and `duplicated` summary table. remove_path_duplicates <- function(dhis2_routine_long) { duplicated <- dhis2_routine_long %>% dplyr::group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>% @@ -88,6 +137,15 @@ remove_path_duplicates <- function(dhis2_routine_long) { list(data = dhis2_routine_long, duplicated = duplicated) } +#' Detect potential stock-out exceptions in PATH logic. +#' +#' Flags periods where PRES is marked outlier while TEST is unusually low and +#' PRES remains within a reasonable upper range, indicating likely stock-out +#' behavior rather than true anomaly. +#' +#' @param dhis2_routine_outliers Routine table with OUTLIER_TREND and stats. +#' @param MEAN_DEVIATION Deviation multiplier used in PATH thresholds. +#' @return Data frame of flagged stock-out exception keys. detect_possible_stockout <- function(dhis2_routine_outliers, MEAN_DEVIATION) { low_testing_periods <- dhis2_routine_outliers %>% dplyr::filter(INDICATOR == "TEST") %>% @@ -105,6 +163,15 @@ detect_possible_stockout <- function(dhis2_routine_outliers, MEAN_DEVIATION) { dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_STKOUT"))) } +#' Detect potential epidemic exceptions in PATH logic. +#' +#' Identifies periods where CONF is outlier and TEST also supports epidemic +#' behavior (test outlier or TEST >= CONF), so values should not be suppressed +#' as reporting anomalies. +#' +#' @param dhis2_routine_outliers Routine table with OUTLIER_TREND and stats. +#' @param MEAN_DEVIATION Deviation multiplier used in PATH thresholds. +#' @return Data frame of flagged epidemic exception keys. detect_possible_epidemic <- function(dhis2_routine_outliers, MEAN_DEVIATION) { dhis2_routine_outliers %>% dplyr::filter(INDICATOR == "TEST" | INDICATOR == "CONF") %>% @@ -121,6 +188,15 @@ detect_possible_epidemic <- function(dhis2_routine_outliers, MEAN_DEVIATION) { dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_EPID"))) } +#' Apply PATH exception logic and build cleaned outlier table. +#' +#' Joins stock-out and epidemic exception flags, updates OUTLIER_TREND after +#' exception rules, and standardizes key output columns including YEAR/MONTH. +#' +#' @param dhis2_routine_outliers Base PATH outlier table. +#' @param possible_stockout Output from `detect_possible_stockout`. +#' @param possible_epidemic Output from `detect_possible_epidemic`. +#' @return Cleaned long-format outlier table for imputation/export. build_path_clean_outliers <- function(dhis2_routine_outliers, possible_stockout, possible_epidemic) { dhis2_routine_outliers %>% dplyr::left_join(possible_stockout, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% @@ -140,6 +216,14 @@ build_path_clean_outliers <- function(dhis2_routine_outliers, possible_stockout, ))) } +#' Impute PATH outliers and enforce TEST/CONF consistency. +#' +#' Replaces flagged values using MEAN_80, reshapes data to evaluate TEST vs CONF +#' consistency, and reverts impossible imputations when they create TEST < CONF +#' while original values were logically consistent. +#' +#' @param routine_data_outliers_clean Clean outlier table from PATH logic. +#' @return Long-format routine table with VALUE_OLD, VALUE_IMPUTED and flags. impute_path_outliers <- function(routine_data_outliers_clean) { routine_data_outliers_clean %>% dplyr::rename(VALUE_OLD = VALUE) %>%