From f5af3bfdd2beceb8884b220a5a5109ce20eb04f6 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 13:05:31 +0100 Subject: [PATCH 01/23] Refactor outlier imputation notebooks to utilize a shared bootstrap context for configuration and package management. Introduced utility functions for imputation and reporting, enhancing modularity and readability. Updated paths and error handling for loading configurations. --- .../snt_dhis2_outliers_imputation_iqr.ipynb | 94 +++----- ...dhis2_outliers_imputation_iqr_report.ipynb | 214 ++---------------- .../utils/bootstrap.R | 45 ++++ .../utils/imputation_utils.R | 47 ++++ .../utils/reporting_utils.R | 166 ++++++++++++++ 5 files changed, 312 insertions(+), 254 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index 9d03d45..dd66522 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -59,24 +59,22 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", 
\"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_iqr_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Pipeline-specific helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" ] }, { @@ -120,14 +118,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" ] }, @@ -536,23 +528,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, 
NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/imputation_utils.R\n", + "start_time <- Sys.time()" ] }, { @@ -629,24 +606,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/imputation_utils.R" ] }, { @@ -661,8 +621,22 @@ "outputs": [], "source": [ "# Format IQR tables (imputed and removed)\n", - "dhis2_routine_iqr_imputed <- format_routine_data_selection(dhis2_routine_outliers_iqr_imputed, iqr_column)\n", - "dhis2_routine_iqr_removed <- format_routine_data_selection(dhis2_routine_outliers_iqr_imputed, iqr_column, remove = TRUE)" + "dhis2_routine_iqr_imputed <- format_routine_data_selection(\n", + " df = 
dhis2_routine_outliers_iqr_imputed,\n", + " outlier_column = iqr_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_iqr_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_iqr_imputed,\n", + " outlier_column = iqr_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 5e9c896..2fea8de 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -35,22 +35,21 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_iqr_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", 
\"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")\n", "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Reporting helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" ] }, { @@ -64,13 +63,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +88,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/reporting_utils.R" ] }, { @@ -231,76 +222,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = 
\"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/reporting_utils.R\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -747,69 +671,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap 
<- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - " \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = 
element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/reporting_utils.R" ] }, { @@ -876,43 +738,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " 
)\n", - "}\n" + "# Coherence map helper loaded from utils/reporting_utils.R" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R new file mode 100644 index 0000000..4f9e7b3 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R @@ -0,0 +1,45 @@ +# Shared bootstrap for the IQR outliers pipeline notebooks. +bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R new file mode 100644 index 0000000..da3b1e9 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R @@ -0,0 +1,47 @@ +# Compute moving-average imputations for a selected outlier flag column. 
+impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + + return(as.data.frame(data.table::copy(dt))) +} + +# Format imputed/removed routine output tables. +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R new file mode 100644 index 0000000..cc6c620 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R @@ -0,0 +1,166 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + 
df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), + subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", + x = "Année", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + + if (nrow(df_ind) == 0) { + return(NULL) + } + + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Détection des valeurs aberrantes —", ind_name), + subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), + x = "District (ADM2)", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7) + ) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!agg_level %in% names(df)) { + stop(paste0("Aggregation level '", agg_level, "' not found in data!")) + } + + df_year <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) 
%>% + dplyr::summarise( + pct_coherent = mean(pct_coherent, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% + dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% + dplyr::ungroup() %>% + dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) + + n_units <- dplyr::n_distinct(df_year[[agg_level]]) + plot_height <- max(6, 0.5 * n_units) + agg_label <- if (agg_level == "ADM1_NAME") { + "niveau administratif 1" + } else if (agg_level == "ADM2_NAME") { + "niveau administratif 2" + } else { + agg_level + } + + p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), + size = 5, + fontface = "bold", + color = "white" + ) + + viridis::scale_fill_viridis( + name = "% cohérent", + limits = c(0, 100), + option = "viridis", + direction = -1 + ) + + ggplot2::labs( + title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), + x = "Règle de cohérence", + y = agg_label + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.y = ggplot2::element_text(size = 12), + axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), + plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), + legend.title = ggplot2::element_text(size = 12), + legend.text = ggplot2::element_text(size = 10) + ) + + options(repr.plot.width = 14, repr.plot.height = plot_height) + + if (!is.null(filename)) { + ggplot2::ggsave( + filename = filename, + plot = p, + width = 14, + height = plot_height, + dpi = 300, + limitsize = FALSE + ) + } + if (do_plot) { + print(p) + } +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!col_name %in% names(map_data)) { + stop(paste0("Column '", 
col_name, "' not found in the data!")) + } + + if (is.null(indicator_label)) { + indicator_label <- col_name + } + + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + + viridis::scale_fill_viridis( + name = paste0("% cohérence\n(", indicator_label, ")"), + option = "magma", + direction = -1, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::facet_wrap(~ YEAR, drop = TRUE) + + ggplot2::labs( + title = "Cohérence des données par niveau administratif 2 et par année", + subtitle = paste("Indicateur :", indicator_label), + caption = "Source : DHIS2 données routinières" + ) + + ggplot2::theme_minimal(base_size = 15) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + strip.text = ggplot2::element_text(size = 14, face = "bold"), + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.position = "right" + ) +} From eb07fe097cfbe070da6f1bc366227dfcb147cd38 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 15:28:33 +0100 Subject: [PATCH 02/23] Refactor outlier imputation notebooks to leverage a shared bootstrap context for configuration and package management. Introduced utility functions for imputation and reporting, improving modularity and readability. Updated paths and error handling for loading configurations across both mean and median imputation pipelines. 
--- .../snt_dhis2_outliers_imputation_mean.ipynb | 94 +++----- ...his2_outliers_imputation_mean_report.ipynb | 214 ++---------------- .../utils/bootstrap.R | 45 ++++ .../utils/imputation_utils.R | 39 ++++ .../utils/reporting_utils.R | 124 ++++++++++ ...snt_dhis2_outliers_imputation_median.ipynb | 94 +++----- ...s2_outliers_imputation_median_report.ipynb | 214 ++---------------- .../utils/bootstrap.R | 45 ++++ .../utils/imputation_utils.R | 39 ++++ .../utils/reporting_utils.R | 124 ++++++++++ 10 files changed, 524 insertions(+), 508 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index b03e267..2f73668 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -59,24 +59,22 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# 
Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Pipeline-specific helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" ] }, { @@ -120,14 +118,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" ] }, @@ -533,23 +525,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, 
INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/imputation_utils.R\n", + "start_time <- Sys.time()" ] }, { @@ -626,24 +603,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/imputation_utils.R" ] }, { @@ -658,8 +618,22 @@ "outputs": [], "source": [ "# Format mean tables (imputed and removed)\n", - "dhis2_routine_mean_imputed <- format_routine_data_selection(dhis2_routine_outliers_mean_imputed, mean_column)\n", - "dhis2_routine_mean_removed <- format_routine_data_selection(dhis2_routine_outliers_mean_imputed, 
mean_column, remove = TRUE)" + "dhis2_routine_mean_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_mean_imputed,\n", + " outlier_column = mean_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_mean_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_mean_imputed,\n", + " outlier_column = mean_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index 9ced93f..ce58bf5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -35,22 +35,21 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = 
c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")\n", "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Reporting helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" ] }, { @@ -64,13 +63,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +88,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/reporting_utils.R" ] }, { @@ -231,76 +222,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- 
ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/reporting_utils.R\n", + "# - plot_outliers()\n", + "# - 
plot_outliers_by_district_facet_year()" ] }, { @@ -747,69 +671,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - " \n", - " if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = 
element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/reporting_utils.R" ] }, { @@ -876,43 +738,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text 
= element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from utils/reporting_utils.R" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R new file mode 100644 index 0000000..8642d85 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R @@ -0,0 +1,45 @@ +# Shared bootstrap for outliers notebooks. +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R new file mode 100644 index 0000000..72f70e7 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R @@ 
-0,0 +1,39 @@ +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R new file mode 100644 index 0000000..719f4f6 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R @@ -0,0 +1,124 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == 
ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), + subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", + x = "Année", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Détection des valeurs aberrantes —", ind_name), + subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), + x = "District (ADM2)", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!agg_level %in% names(df)) { + stop(paste0("Aggregation level '", agg_level, "' not found in data!")) + } + df_year <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% + dplyr::summarise(pct_coherent = mean(pct_coherent, 
na.rm = TRUE), .groups = "drop") %>% + dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% + dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% + dplyr::ungroup() %>% + dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) + n_units <- dplyr::n_distinct(df_year[[agg_level]]) + plot_height <- max(6, 0.5 * n_units) + agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level + p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + + viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + + ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.y = ggplot2::element_text(size = 12), + axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), + plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), + legend.title = ggplot2::element_text(size = 12), + legend.text = ggplot2::element_text(size = 10) + ) + options(repr.plot.width = 14, repr.plot.height = plot_height) + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) + } + if (do_plot) { + print(p) + } +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!col_name %in% names(map_data)) { + stop(paste0("Column '", col_name, "' not found in the data!")) + } + if (is.null(indicator_label)) { + indicator_label <- col_name + } + ggplot2::ggplot(map_data) + + 
ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + + viridis::scale_fill_viridis( + name = paste0("% cohérence\n(", indicator_label, ")"), + option = "magma", + direction = -1, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::facet_wrap(~ YEAR, drop = TRUE) + + ggplot2::labs( + title = "Cohérence des données par niveau administratif 2 et par année", + subtitle = paste("Indicateur :", indicator_label), + caption = "Source : DHIS2 données routinières" + ) + + ggplot2::theme_minimal(base_size = 15) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + strip.text = ggplot2::element_text(size = 14, face = "bold"), + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.position = "right" + ) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index 90275b5..4c81480 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -59,24 +59,22 @@ "source": [ "# Project folders (ROOT_PATH injected by pipeline if not set)\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", 
\"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", + ")\n", "\n", - "# Load libraries \n", - "required_packages <- c( \"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Pipeline-specific helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" ] }, { @@ -120,14 +118,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - " })\n", - "\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "log_msg(glue(\"SNT configuration loaded from : {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" ] }, @@ -533,23 +525,8 @@ }, "outputs": [], "source": [ - "# Define helper function to compute moving average for an outlier column\n", - "start_time <- Sys.time()\n", - "\n", - "impute_outliers_dt <- function(dt, outlier_col) {\n", - " dt <- as.data.table(dt) # transform to datatable\n", - " setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) \n", - " dt[, TO_IMPUTE := fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] # Compute TO_IMPUTE column\n", - " \n", - " # Fast rolling mean by group\n", - " dt[, MOVING_AVG := frollapply(TO_IMPUTE, n = 3, FUN = function(x) ceiling(mean(x, na.rm = 
TRUE)), align = \"center\"), \n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)]\n", - " \n", - " dt[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] \n", - " dt[, c(\"TO_IMPUTE\") := NULL] # clean up \"MOVING_AVG\"\n", - " \n", - " return(as.data.frame(copy(dt)))\n", - "}" + "# Helper loaded from utils/imputation_utils.R\n", + "start_time <- Sys.time()" ] }, { @@ -626,24 +603,7 @@ }, "outputs": [], "source": [ - "# Define helper function to format both versions \n", - "format_routine_data_selection <- function(df, outlier_column, remove = FALSE) {\n", - " \n", - " # remove outliers \n", - " if (remove) df <- df %>% filter(!.data[[outlier_column]])\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", DHIS2_INDICATORS)\n", - " \n", - " output <- df %>%\n", - " select(-VALUE) %>%\n", - " rename(VALUE = VALUE_IMPUTED) %>%\n", - " select(all_of(fixed_cols), INDICATOR, VALUE) %>% # global: fixed_cols\n", - " mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>%\n", - " pivot_wider(names_from = \"INDICATOR\", values_from = \"VALUE\") %>%\n", - " left_join(pyramid_names, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"))\n", - "\n", - " output %>% select(all_of(intersect(target_cols, names(output))))\n", - "}" + "# Helper loaded from utils/imputation_utils.R" ] }, { @@ -658,8 +618,22 @@ "outputs": [], "source": [ "# Format median tables (imputed and removed)\n", - "dhis2_routine_median_imputed <- format_routine_data_selection(dhis2_routine_outliers_median_imputed, median_column)\n", - "dhis2_routine_median_removed <- format_routine_data_selection(dhis2_routine_outliers_median_imputed, median_column, remove = TRUE)" + "dhis2_routine_median_imputed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_median_imputed,\n", + " outlier_column = median_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " 
pyramid_names = pyramid_names\n", + ")\n", + "\n", + "dhis2_routine_median_removed <- format_routine_data_selection(\n", + " df = dhis2_routine_outliers_median_imputed,\n", + " outlier_column = median_column,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " fixed_cols = fixed_cols,\n", + " pyramid_names = pyramid_names,\n", + " remove = TRUE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 4705ad1..04b02b6 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -35,22 +35,21 @@ "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + "# Shared bootstrap for this pipeline\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_outliers_context(\n", + " root_path = SNT_ROOT_PATH,\n", + " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", + ")\n", "\n", - "# Execute function\n", 
- "install_and_load(required_packages)\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "openhexa <- setup_ctx$openhexa\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "# Reporting helpers\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" ] }, { @@ -64,13 +63,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", + "# Load SNT config from bootstrap context\n", + "config_json <- setup_ctx$config_json\n", "\n", "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", @@ -94,10 +88,7 @@ }, "outputs": [], "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" + "# Helper loaded from utils/reporting_utils.R" ] }, { @@ -231,76 +222,9 @@ }, "outputs": [], "source": [ - "#--- FUNCTIONS TO MAKE ONE PLOT ---\n", - "plot_outliers <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>% filter(INDICATOR == ind_name)\n", - "\n", - " # Remove infinite or impossible values explicitly → removes warnings\n", - " df_ind <- df_ind %>% \n", - " filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE))\n", - "\n", - " p <- ggplot(df_ind, aes(x = YEAR, y = VALUE)) +\n", - " \n", - " # All values (grey)\n", - " geom_point(alpha = 0.25, color = \"grey40\", na.rm = TRUE) +\n", - " \n", - " # Outliers (red)\n", - " geom_point(\n", - " data = df_ind %>% 
filter(.data[[outlier_col]] == TRUE),\n", - " aes(x = YEAR, y = VALUE),\n", - " color = \"red\",\n", - " size = 2.8,\n", - " alpha = 0.85,\n", - " na.rm = TRUE\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste(\"Inspection des valeurs aberrantes pour indicateur:\", ind_name),\n", - " subtitle = \"Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées\",\n", - " x = \"Année\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 14)\n", - "\n", - " return(p)\n", - "}\n", - "\n", - "#plots <- map(unique_inds, ~ plot_outliers(.x, routine_data, outlier_col))\n", - "#walk(plots, print)\n", - "\n", - "plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) {\n", - " \n", - " df_ind <- df %>%\n", - " filter(\n", - " INDICATOR == ind_name,\n", - " !is.na(YEAR),\n", - " !is.na(VALUE),\n", - " is.finite(VALUE)\n", - " )\n", - " \n", - " if (nrow(df_ind) == 0) return(NULL)\n", - " \n", - " ggplot(df_ind, aes(x = ADM2_ID, y = VALUE)) +\n", - " geom_point(color = \"grey60\", alpha = 0.3) +\n", - " geom_point(\n", - " data = df_ind %>% filter(.data[[outlier_col]] == TRUE),\n", - " color = \"red\", \n", - " size = 2.8,\n", - " alpha = 0.85\n", - " ) +\n", - " facet_wrap(~ YEAR, scales = \"free_y\") +\n", - " labs(\n", - " title = paste(\"Détection des valeurs aberrantes —\", ind_name),\n", - " subtitle = paste(\"Méthode :\", outlier_col, \"| Rouge = valeur aberrante\"),\n", - " x = \"District (ADM2)\",\n", - " y = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 75, hjust = 1, size = 7)\n", - " )\n", - "}" + "# Plot helpers loaded from utils/reporting_utils.R\n", + "# - plot_outliers()\n", + "# - plot_outliers_by_district_facet_year()" ] }, { @@ -747,69 +671,7 @@ }, "outputs": [], "source": [ - "# Define heatmap function\n", - "plot_coherence_heatmap <- function(df, selected_year, agg_level = \"ADM1_NAME\", filename = NULL, do_plot = TRUE) {\n", - " \n", - 
" if (!agg_level %in% names(df)) {\n", - " stop(paste0(\"Aggregation level '\", agg_level, \"' not found in data!\"))\n", - " }\n", - " \n", - " # Aggregate pct_coherent by chosen level + check_label\n", - " df_year <- df %>%\n", - " filter(YEAR == selected_year) %>%\n", - " group_by(across(all_of(c(agg_level, \"check_label\")))) %>%\n", - " summarise(\n", - " pct_coherent = mean(pct_coherent, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " group_by(across(all_of(agg_level))) %>%\n", - " mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>%\n", - " ungroup() %>%\n", - " mutate(!!agg_level := fct_reorder(.data[[agg_level]], median_coh))\n", - " \n", - " n_units <- n_distinct(df_year[[agg_level]])\n", - " plot_height <- max(6, 0.5 * n_units) # dynamically adjust height\n", - " agg_label <- if (agg_level == \"ADM1_NAME\") {\n", - " \"niveau administratif 1\"\n", - " } else if (agg_level == \"ADM2_NAME\") {\n", - " \"niveau administratif 2\"\n", - " } else {\n", - " agg_level # fallback, in case a different level is passed\n", - " }\n", - " \n", - " p <- ggplot(df_year, aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) +\n", - " geom_tile(color = \"white\", linewidth = 0.2) +\n", - " geom_text(aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " size = 5, fontface = \"bold\", color = \"white\") +\n", - " scale_fill_viridis(name = \"% cohérent\", limits = c(0, 100),\n", - " option = \"viridis\", direction = -1) +\n", - " labs(\n", - " title = paste0(\"Cohérence des données par \", agg_label, \" - \", selected_year),\n", - " x = \"Règle de cohérence\",\n", - " y = agg_label\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " axis.text.y = element_text(size = 12),\n", - " axis.text.x = element_text(size = 12, angle = 30, hjust = 1),\n", - " plot.title = element_text(size = 16, face = \"bold\", hjust = 0.5),\n", - " legend.title = element_text(size = 12),\n", - " 
legend.text = element_text(size = 10)\n", - " )\n", - " \n", - " # Adjust notebook display\n", - " options(repr.plot.width = 14, repr.plot.height = plot_height)\n", - " \n", - " # Save if filename is provided\n", - " if (!is.null(filename)) {\n", - " ggsave(filename = filename, plot = p,\n", - " width = 14, height = plot_height, dpi = 300,\n", - " limitsize = FALSE)\n", - " }\n", - " if (do_plot) { print(p) }\n", - " # return(p)\n", - "}" + "# Coherence heatmap helper loaded from utils/reporting_utils.R" ] }, { @@ -876,43 +738,7 @@ }, "outputs": [], "source": [ - "# Define function\n", - "plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) {\n", - " \n", - " # Check if column exists\n", - " if (!col_name %in% names(map_data)) {\n", - " stop(paste0(\"Column '\", col_name, \"' not found in the data!\"))\n", - " }\n", - " \n", - " # Default legend title if not provided\n", - " if (is.null(indicator_label)) {\n", - " indicator_label <- col_name\n", - " }\n", - " \n", - " ggplot(map_data) +\n", - " geom_sf(aes(fill = .data[[col_name]]), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis(\n", - " name = paste0(\"% cohérence\\n(\", indicator_label, \")\"),\n", - " option = \"magma\",\n", - " direction = -1,\n", - " limits = c(0, 100),\n", - " na.value = \"grey90\"\n", - " ) +\n", - " # facet_wrap(~ YEAR) +\n", - " facet_wrap(~ YEAR, drop = TRUE) +\n", - " labs(\n", - " title = \"Cohérence des données par niveau administratif 2 et par année\",\n", - " subtitle = paste(\"Indicateur :\", indicator_label),\n", - " caption = \"Source : DHIS2 données routinières\"\n", - " ) +\n", - " theme_minimal(base_size = 15) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " strip.text = element_text(size = 14, face = \"bold\"),\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.position = \"right\"\n", - " )\n", - "}\n" + "# Coherence map helper loaded from utils/reporting_utils.R" ] }, { diff --git 
a/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R new file mode 100644 index 0000000..8642d85 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R @@ -0,0 +1,45 @@ +# Shared bootstrap for outliers notebooks. +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R new file mode 100644 index 0000000..72f70e7 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R @@ -0,0 +1,39 @@ +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := 
data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R new file mode 100644 index 0000000..719f4f6 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R @@ -0,0 +1,124 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = 
"grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), + subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", + x = "Année", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 14) +} + +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Détection des valeurs aberrantes —", ind_name), + subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), + x = "District (ADM2)", + y = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) +} + +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!agg_level %in% names(df)) { + stop(paste0("Aggregation level '", agg_level, "' not found in data!")) + } + df_year <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% + dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% + dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% + dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% + dplyr::ungroup() %>% + 
dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) + n_units <- dplyr::n_distinct(df_year[[agg_level]]) + plot_height <- max(6, 0.5 * n_units) + agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level + p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + + viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + + ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.y = ggplot2::element_text(size = 12), + axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), + plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), + legend.title = ggplot2::element_text(size = 12), + legend.text = ggplot2::element_text(size = 10) + ) + options(repr.plot.width = 14, repr.plot.height = plot_height) + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) + } + if (do_plot) { + print(p) + } +} + +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!col_name %in% names(map_data)) { + stop(paste0("Column '", col_name, "' not found in the data!")) + } + if (is.null(indicator_label)) { + indicator_label <- col_name + } + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + + viridis::scale_fill_viridis( + name = paste0("% cohérence\n(", indicator_label, ")"), + option = "magma", + 
direction = -1, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::facet_wrap(~ YEAR, drop = TRUE) + + ggplot2::labs( + title = "Cohérence des données par niveau administratif 2 et par année", + subtitle = paste("Indicateur :", indicator_label), + caption = "Source : DHIS2 données routinières" + ) + + ggplot2::theme_minimal(base_size = 15) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + strip.text = ggplot2::element_text(size = 14, face = "bold"), + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.position = "right" + ) +} From 068efb9205c578e3d06304bcdbf2d6ac345bb101 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 15:45:21 +0100 Subject: [PATCH 03/23] Refactor Magic Glasses notebooks to utilize a shared bootstrap context for configuration and package management. Introduced utility functions for outlier detection, enhancing modularity and readability. Updated paths for loading configurations and improved error handling in reporting notebooks. 
--- ...s2_outliers_imputation_magic_glasses.ipynb | 178 +++++------------- ...iers_imputation_magic_glasses_report.ipynb | 92 ++++----- .../utils/bootstrap.R | 28 +++ .../utils/magic_glasses_utils.R | 97 ++++++++++ 4 files changed, 223 insertions(+), 172 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 60cfa92..c3b5b3a 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "4e1fe23c", "metadata": {}, "source": [ "# Outliers Detection - Magic Glasses (MG)\n", @@ -10,18 +9,16 @@ "Dedicated notebook for MG outlier detection only:\n", "- `OUTLIER_MAGIC_GLASSES_PARTIAL` (MAD15 -> MAD10)\n", "- `OUTLIER_MAGIC_GLASSES_COMPLETE` (MAD15 -> MAD10 -> seasonal5 -> seasonal3)" - ] + ], + "id": "4e1fe23c" }, { "cell_type": "code", - "execution_count": null, - "id": "1ddc1fb2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Parameters with safe defaults\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", @@ -50,20 +47,21 @@ "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", "OUTPUT_DIR <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", "dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1ddc1fb2" }, { "cell_type": "code", - "execution_count": null, - "id": "c91aab68", "metadata": { "vscode": { "languageId": "r" } }, - 
"outputs": [], "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", "\n", "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", @@ -72,10 +70,18 @@ "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", " required_packages <- c(required_packages, \"future\", \"future.apply\")\n", "}\n", - "install_and_load(unique(required_packages))\n", "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "setup_ctx <- bootstrap_magic_glasses_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = required_packages\n", + ")\n", + "\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "openhexa <- setup_ctx$openhexa\n", + "\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"magic_glasses_utils.R\"))\n", "\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", @@ -121,104 +127,34 @@ "} else {\n", " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c91aab68" }, { "cell_type": "code", - "execution_count": null, - "id": "652f7a2e", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "detect_outliers_mad_custom <- function(dt, deviation) {\n", - " flag_col <- paste0(\"OUTLIER_MAD\", deviation)\n", - " dt <- copy(dt)\n", - " dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)]\n", - " dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)]\n", - " dt[, 
(flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))]\n", - " dt[is.na(get(flag_col)), (flag_col) := FALSE]\n", - " dt[, c(\"median_val\", \"mad_val\") := NULL]\n", - " dt\n", - "}\n", - "\n", - "detect_seasonal_outliers <- function(dt, deviation, workers = 1) {\n", - " outlier_col <- paste0(\"OUTLIER_SEASONAL\", deviation)\n", - " dt <- copy(dt)\n", - " setorder(dt, OU_ID, INDICATOR, PERIOD)\n", - "\n", - " process_group <- function(sub_dt) {\n", - " n_valid <- sum(!is.na(sub_dt$VALUE))\n", - " if (n_valid < 2) {\n", - " return(data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))\n", - " ))\n", - " }\n", - "\n", - " values <- as.numeric(sub_dt$VALUE)\n", - " ts_data <- stats::ts(values, frequency = 12)\n", - " cleaned_ts <- tryCatch(\n", - " forecast::tsclean(ts_data, replace.missing = TRUE),\n", - " error = function(e) ts_data\n", - " )\n", - " mad_val <- mad(values, constant = 1, na.rm = TRUE)\n", - "\n", - " if (is.na(mad_val) || mad_val == 0) {\n", - " return(data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = rep(FALSE, nrow(sub_dt))\n", - " ))\n", - " }\n", - "\n", - " is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation\n", - " is_outlier[is.na(is_outlier)] <- FALSE\n", - "\n", - " data.table(\n", - " PERIOD = sub_dt$PERIOD,\n", - " OU_ID = sub_dt$OU_ID,\n", - " INDICATOR = sub_dt$INDICATOR,\n", - " OUTLIER_FLAG = as.logical(is_outlier)\n", - " )\n", - " }\n", - "\n", - " group_keys <- unique(dt[, .(OU_ID, INDICATOR)])\n", - " group_list <- lapply(seq_len(nrow(group_keys)), function(i) {\n", - " dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]]\n", - " })\n", - "\n", - " if (workers > 1 && requireNamespace(\"future.apply\", quietly = TRUE)) {\n", - " result_list 
<- future.apply::future_lapply(group_list, process_group, future.seed = TRUE)\n", - " } else {\n", - " result_list <- lapply(group_list, process_group)\n", - " }\n", - "\n", - " outlier_flags <- rbindlist(result_list, use.names = TRUE)\n", - " setnames(outlier_flags, \"OUTLIER_FLAG\", outlier_col)\n", - "\n", - " result_dt <- merge(dt, outlier_flags, by = c(\"PERIOD\", \"OU_ID\", \"INDICATOR\"), all.x = TRUE)\n", - " result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE]\n", - " result_dt\n", - "}" - ] + "# Helpers loaded from utils/magic_glasses_utils.R\n", + "# - detect_outliers_mad_custom()\n", + "# - detect_seasonal_outliers()" + ], + "execution_count": null, + "outputs": [], + "id": "652f7a2e" }, { "cell_type": "code", - "execution_count": null, - "id": "1720f1f3", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"Starting MAD15 detection...\")\n", @@ -286,18 +222,18 @@ " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", " log_msg(glue::glue(\"SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1720f1f3" }, { "cell_type": "code", - "execution_count": null, - "id": "d6adc76b", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", "flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", @@ -354,31 +290,6 @@ "n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE)\n", "log_msg(glue::glue(\"Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {COUNTRY_CODE}_routine_outliers_detected.parquet\"))\n", "\n", - "# Helper to restore routine dataset format (same structure as other outlier pipelines)\n", - 
"to_routine_wide <- function(dt_long) {\n", - " routine_wide <- dcast(\n", - " dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)],\n", - " PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR,\n", - " value.var = \"VALUE\"\n", - " )\n", - "\n", - " routine_wide <- merge(routine_wide, unique(pyramid_names), by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"), all.x = TRUE)\n", - "\n", - " target_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"OU_ID\", \"OU_NAME\", indicators_to_keep)\n", - " for (col in setdiff(target_cols, names(routine_wide))) {\n", - " if (col %in% indicators_to_keep) {\n", - " routine_wide[, (col) := NA_real_]\n", - " } else if (col %in% c(\"YEAR\", \"MONTH\")) {\n", - " routine_wide[, (col) := NA_integer_]\n", - " } else {\n", - " routine_wide[, (col) := NA_character_]\n", - " }\n", - " }\n", - " cols_to_keep <- intersect(target_cols, names(routine_wide))\n", - " routine_wide <- routine_wide[, ..cols_to_keep]\n", - " routine_wide\n", - "}\n", - "\n", "# 2) Imputed routine data (same moving-average logic as other outlier pipelines)\n", "imputed_long <- copy(flagged_outliers_mg)\n", "setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH)\n", @@ -396,7 +307,12 @@ "imputed_long[, VALUE := VALUE_IMPUTED]\n", "imputed_long[, c(\"TO_IMPUTE\", \"MOVING_AVG\", \"VALUE_IMPUTED\") := NULL]\n", "\n", - "routine_imputed <- to_routine_wide(imputed_long)\n", + "routine_imputed <- to_routine_wide(\n", + " dt_long = imputed_long,\n", + " fixed_cols = fixed_cols,\n", + " indicators_to_keep = indicators_to_keep,\n", + " pyramid_names = pyramid_names\n", + ")\n", "arrow::write_parquet(routine_imputed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_imputed.parquet\")))\n", "log_msg(glue::glue(\"Exported routine imputed table to {COUNTRY_CODE}_routine_outliers_imputed.parquet\"))\n", "\n", @@ -405,12 +321,20 @@ "removed_long <- 
copy(flagged_outliers_mg)\n", "removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_]\n", "\n", - "routine_removed <- to_routine_wide(removed_long)\n", + "routine_removed <- to_routine_wide(\n", + " dt_long = removed_long,\n", + " fixed_cols = fixed_cols,\n", + " indicators_to_keep = indicators_to_keep,\n", + " pyramid_names = pyramid_names\n", + ")\n", "arrow::write_parquet(routine_removed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_removed.parquet\")))\n", "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", "\n", "log_msg(\"MG outlier tables exported successfully.\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "d6adc76b" } ], "metadata": { @@ -430,4 +354,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 4751905..643cfa5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "c254a43d", "metadata": {}, "source": [ "# MG Outliers Detection Report\n", @@ -13,31 +12,31 @@ "- **Méthode complète** : MAD15 → MAD10 → seasonal5 → seasonal3 (outliers complets).\n", "\n", "Les sections ci-dessous résument le nombre d’outliers détectés et leur répartition par indicateur et par année." 
- ] + ], + "id": "c254a43d" }, { "cell_type": "code", - "execution_count": null, - "id": "de9e854f", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Setup\n", "ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"outliers_imputation\")\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "setup_ctx <- bootstrap_magic_glasses_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\")\n", + ")\n", "\n", - "# Align logging init with other production notebooks\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "CODE_PATH <- setup_ctx$CODE_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- file.path(setup_ctx$DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "openhexa <- setup_ctx$openhexa\n", "\n", "config_json <- fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", @@ -75,26 +74,26 @@ "} else {\n", " log_msg(\"MG detected outlier file not found.\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "de9e854f" }, { "cell_type": "markdown", - "id": "181df7b3", "metadata": {}, "source": [ "## 1. 
Résumé des outliers détectés" - ] + ], + "id": "181df7b3" }, { "cell_type": "code", - "execution_count": null, - "id": "f8c790fd", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Tableau récapitulatif (totaux)\n", "summary_totals <- data.frame(\n", @@ -105,26 +104,26 @@ " )\n", ")\n", "knitr::kable(summary_totals, format = \"simple\", col.names = c(\"Méthode\", \"Nombre d'outliers\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8c790fd" }, { "cell_type": "markdown", - "id": "14eb3d09", "metadata": {}, "source": [ "## 2. Répartition par indicateur" - ] + ], + "id": "14eb3d09" }, { "cell_type": "code", - "execution_count": null, - "id": "71e65233", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode partielle)\n", "if (!is.null(partial_tbl) && nrow(partial_tbl) > 0 && \"INDICATOR\" %in% names(partial_tbl)) {\n", @@ -136,18 +135,18 @@ "} else {\n", " cat(\"Aucune donnée partielle disponible.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "71e65233" }, { "cell_type": "code", - "execution_count": null, - "id": "3223c9b2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode complète)\n", "if (!is.null(complete_tbl) && nrow(complete_tbl) > 0 && \"INDICATOR\" %in% names(complete_tbl)) {\n", @@ -159,18 +158,18 @@ "} else {\n", " cat(\"Aucune donnée complète disponible (ou méthode complète non exécutée).\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "3223c9b2" }, { "cell_type": "code", - "execution_count": null, - "id": "39239983", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Graphique : répartition par indicateur (partiel et/ou complet)\n", "plot_df <- NULL\n", @@ -204,26 +203,26 @@ "} else {\n", " cat(\"Aucune donnée à afficher.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": 
[], + "id": "39239983" }, { "cell_type": "markdown", - "id": "28666b74", "metadata": {}, "source": [ "## 3. Évolution par année" - ] + ], + "id": "28666b74" }, { "cell_type": "code", - "execution_count": null, - "id": "1b1fcd6d", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Tableau : nombre d'outliers par année (partiel et/ou complet)\n", "year_tab <- NULL\n", @@ -239,18 +238,18 @@ "} else {\n", " cat(\"Aucune donnée par année disponible.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1b1fcd6d" }, { "cell_type": "code", - "execution_count": null, - "id": "1747b5b6", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Graphique : évolution du nombre d'outliers par année\n", "year_df <- NULL\n", @@ -281,7 +280,10 @@ "} else {\n", " cat(\"Aucune donnée par année à afficher.\\n\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1747b5b6" } ], "metadata": { @@ -301,4 +303,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R new file mode 100644 index 0000000..3e1d63d --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R @@ -0,0 +1,28 @@ +# Shared bootstrap for Magic Glasses notebooks. 
+bootstrap_magic_glasses_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(unique(required_packages)) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa + )) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R new file mode 100644 index 0000000..ce57811 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R @@ -0,0 +1,97 @@ +detect_outliers_mad_custom <- function(dt, deviation) { + flag_col <- paste0("OUTLIER_MAD", deviation) + dt <- data.table::copy(dt) + dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))] + dt[is.na(get(flag_col)), (flag_col) := FALSE] + dt[, c("median_val", "mad_val") := NULL] + dt +} + +detect_seasonal_outliers <- function(dt, deviation, workers = 1) { + outlier_col <- paste0("OUTLIER_SEASONAL", deviation) + dt <- data.table::copy(dt) + data.table::setorder(dt, OU_ID, INDICATOR, PERIOD) + + process_group <- function(sub_dt) { + n_valid <- sum(!is.na(sub_dt$VALUE)) + if (n_valid < 2) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = 
sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + values <- as.numeric(sub_dt$VALUE) + ts_data <- stats::ts(values, frequency = 12) + cleaned_ts <- tryCatch( + forecast::tsclean(ts_data, replace.missing = TRUE), + error = function(e) ts_data + ) + mad_val <- mad(values, constant = 1, na.rm = TRUE) + + if (is.na(mad_val) || mad_val == 0) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation + is_outlier[is.na(is_outlier)] <- FALSE + + data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = as.logical(is_outlier) + ) + } + + group_keys <- unique(dt[, .(OU_ID, INDICATOR)]) + group_list <- lapply(seq_len(nrow(group_keys)), function(i) { + dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]] + }) + + if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) { + result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE) + } else { + result_list <- lapply(group_list, process_group) + } + + outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE) + data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col) + + result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE) + result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE] + result_dt +} + +to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { + routine_wide <- data.table::dcast( + dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], + PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR, + value.var = "VALUE" + ) + + routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) + + target_cols <- 
c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", indicators_to_keep) + for (col in setdiff(target_cols, names(routine_wide))) { + if (col %in% indicators_to_keep) { + routine_wide[, (col) := NA_real_] + } else if (col %in% c("YEAR", "MONTH")) { + routine_wide[, (col) := NA_integer_] + } else { + routine_wide[, (col) := NA_character_] + } + } + cols_to_keep <- intersect(target_cols, names(routine_wide)) + routine_wide <- routine_wide[, ..cols_to_keep] + routine_wide +} From a2cb550bcfecb4dfa95cd90909bba5bf53af13cb Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 24 Mar 2026 17:32:13 +0100 Subject: [PATCH 04/23] Enhance SNT utility functions for configuration loading and data processing. Introduced error handling for configuration loading, validation of required keys, and standardized output path creation. Updated outlier imputation notebooks to utilize these new utilities, improving modularity and readability. --- code/snt_utils.r | 108 ++++++++++++++++++ ...s2_outliers_imputation_magic_glasses.ipynb | 46 ++++---- ...iers_imputation_magic_glasses_report.ipynb | 74 ++++++------ .../snt_dhis2_outliers_imputation_mean.ipynb | 67 +++++------ 4 files changed, 199 insertions(+), 96 deletions(-) diff --git a/code/snt_utils.r b/code/snt_utils.r index 8efda14..838e276 100644 --- a/code/snt_utils.r +++ b/code/snt_utils.r @@ -58,6 +58,114 @@ install_and_load <- function(packages) { print(loaded_packages) } +# Load SNT configuration file with a consistent error message +load_snt_config <- function(config_path, config_file_name = "SNT_config.json") { + config_file <- file.path(config_path, config_file_name) + config_json <- tryCatch( + { + jsonlite::fromJSON(config_file) + }, + error = function(e) { + msg <- paste0("[ERROR] Error while loading configuration ", conditionMessage(e)) + stop(msg) + } + ) + return(config_json) +} + +# Validate that required keys exist in a config section +validate_required_config_keys <- 
function(config_json, keys, section = "SNT_CONFIG") { + if (is.null(config_json[[section]])) { + stop(paste0("[ERROR] Missing configuration section: ", section)) + } + + missing_keys <- keys[!keys %in% names(config_json[[section]])] + if (length(missing_keys) > 0) { + stop(paste0("[ERROR] Missing configuration input(s): ", paste(missing_keys, collapse = ", "))) + } + + invisible(TRUE) +} + +# Generic helper to load a country-specific dataset file +load_country_file_from_dataset <- function(dataset_id, country_code, suffix, label = NULL) { + file_name <- paste0(country_code, suffix) + output_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, file_name) + }, + error = function(e) { + target_label <- if (is.null(label)) file_name else label + msg <- paste0( + "[ERROR] Error while loading ", + target_label, + " (dataset: ", + dataset_id, + ", file: ", + file_name, + "): ", + conditionMessage(e) + ) + stop(msg) + } + ) + + log_msg(paste0("Loaded file `", file_name, "` from dataset `", dataset_id, "`.")) + return(output_data) +} + +# Ensure YEAR and MONTH are stored as integers when present +normalize_year_month_types <- function(input_df, year_col = "YEAR", month_col = "MONTH") { + output_df <- input_df + if (year_col %in% names(output_df)) { + output_df[[year_col]] <- as.integer(output_df[[year_col]]) + } + if (month_col %in% names(output_df)) { + output_df[[month_col]] <- as.integer(output_df[[month_col]]) + } + return(output_df) +} + +# Standard routine preparation: select, pivot longer, optional deduplication +prepare_routine_long <- function(routine_df, fixed_cols, indicators, deduplicate = TRUE) { + cols_to_select <- intersect(c(fixed_cols, indicators), names(routine_df)) + missing_indicators <- setdiff(indicators, names(routine_df)) + if (length(missing_indicators) > 0) { + stop(paste0("[ERROR] Missing indicator column(s): ", paste(missing_indicators, collapse = ", "))) + } + + routine_long <- routine_df %>% + 
dplyr::select(dplyr::all_of(cols_to_select)) %>% + tidyr::pivot_longer( + cols = dplyr::all_of(indicators), + names_to = "INDICATOR", + values_to = "VALUE" + ) + + if (deduplicate) { + dedup_keys <- intersect(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "YEAR", "MONTH", "INDICATOR"), names(routine_long)) + routine_long <- routine_long %>% + dplyr::distinct(dplyr::across(dplyr::all_of(dedup_keys)), .keep_all = TRUE) + } + + return(routine_long) +} + +# Build a standardized output path under /data and create it if needed +standard_output_path <- function(data_root_path, domain, subdomain = NULL, create_dir = TRUE) { + target_path <- if (is.null(subdomain) || nchar(subdomain) == 0) { + file.path(data_root_path, domain) + } else { + file.path(data_root_path, domain, subdomain) + } + + if (create_dir && !dir.exists(target_path)) { + dir.create(target_path, recursive = TRUE, showWarnings = FALSE) + } + + return(target_path) +} + # Helper to safely extract values from parameters (allows to specify the type) get_param <- function(params_list, target_param, default, cast_method = identity) { #' Safely retrieve a parameter if it exists in the input, using a default fallback if it doesn't exist in the inupt diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index c3b5b3a..78a0083 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "4e1fe23c", "metadata": {}, "source": [ "# Outliers Detection - Magic Glasses (MG)\n", @@ -9,16 +10,18 @@ "Dedicated notebook for MG outlier detection only:\n", "- `OUTLIER_MAGIC_GLASSES_PARTIAL` (MAD15 -> MAD10)\n", "- 
`OUTLIER_MAGIC_GLASSES_COMPLETE` (MAD15 -> MAD10 -> seasonal5 -> seasonal3)" - ], - "id": "4e1fe23c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1ddc1fb2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Parameters with safe defaults\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", @@ -47,18 +50,18 @@ "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", "OUTPUT_DIR <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", "dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)" - ], - "execution_count": null, - "outputs": [], - "id": "1ddc1fb2" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c91aab68", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", @@ -127,34 +130,34 @@ "} else {\n", " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "c91aab68" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "652f7a2e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Helpers loaded from utils/magic_glasses_utils.R\n", "# - detect_outliers_mad_custom()\n", "# - detect_seasonal_outliers()" - ], - "execution_count": null, - "outputs": [], - "id": "652f7a2e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1720f1f3", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"Starting MAD15 detection...\")\n", @@ -222,18 +225,18 @@ " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", " log_msg(glue::glue(\"SEASONAL complete done: 
{sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "1720f1f3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "d6adc76b", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", "flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", @@ -331,10 +334,7 @@ "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", "\n", "log_msg(\"MG outlier tables exported successfully.\")" - ], - "execution_count": null, - "outputs": [], - "id": "d6adc76b" + ] } ], "metadata": { @@ -354,4 +354,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 643cfa5..3ceddcb 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "c254a43d", "metadata": {}, "source": [ "# MG Outliers Detection Report\n", @@ -12,16 +13,18 @@ "- **Méthode complète** : MAD15 → MAD10 → seasonal5 → seasonal3 (outliers complets).\n", "\n", "Les sections ci-dessous résument le nombre d’outliers détectés et leur répartition par indicateur et par année." 
- ], - "id": "c254a43d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "de9e854f", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Setup\n", "ROOT_PATH <- \"~/workspace\"\n", @@ -74,26 +77,26 @@ "} else {\n", " log_msg(\"MG detected outlier file not found.\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "de9e854f" + ] }, { "cell_type": "markdown", + "id": "181df7b3", "metadata": {}, "source": [ "## 1. Résumé des outliers détectés" - ], - "id": "181df7b3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f8c790fd", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Tableau récapitulatif (totaux)\n", "summary_totals <- data.frame(\n", @@ -104,26 +107,26 @@ " )\n", ")\n", "knitr::kable(summary_totals, format = \"simple\", col.names = c(\"Méthode\", \"Nombre d'outliers\"))" - ], - "execution_count": null, - "outputs": [], - "id": "f8c790fd" + ] }, { "cell_type": "markdown", + "id": "14eb3d09", "metadata": {}, "source": [ "## 2. 
Répartition par indicateur" - ], - "id": "14eb3d09" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "71e65233", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode partielle)\n", "if (!is.null(partial_tbl) && nrow(partial_tbl) > 0 && \"INDICATOR\" %in% names(partial_tbl)) {\n", @@ -135,18 +138,18 @@ "} else {\n", " cat(\"Aucune donnée partielle disponible.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "71e65233" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3223c9b2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Nombre d'outliers par indicateur (méthode complète)\n", "if (!is.null(complete_tbl) && nrow(complete_tbl) > 0 && \"INDICATOR\" %in% names(complete_tbl)) {\n", @@ -158,18 +161,18 @@ "} else {\n", " cat(\"Aucune donnée complète disponible (ou méthode complète non exécutée).\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "3223c9b2" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "39239983", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Graphique : répartition par indicateur (partiel et/ou complet)\n", "plot_df <- NULL\n", @@ -203,26 +206,26 @@ "} else {\n", " cat(\"Aucune donnée à afficher.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "39239983" + ] }, { "cell_type": "markdown", + "id": "28666b74", "metadata": {}, "source": [ "## 3. 
Évolution par année" - ], - "id": "28666b74" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1b1fcd6d", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Tableau : nombre d'outliers par année (partiel et/ou complet)\n", "year_tab <- NULL\n", @@ -238,18 +241,18 @@ "} else {\n", " cat(\"Aucune donnée par année disponible.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "1b1fcd6d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1747b5b6", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Graphique : évolution du nombre d'outliers par année\n", "year_df <- NULL\n", @@ -280,10 +283,7 @@ "} else {\n", " cat(\"Aucune donnée par année à afficher.\\n\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "1747b5b6" + ] } ], "metadata": { @@ -303,4 +303,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 2f73668..62c5d49 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -134,15 +134,12 @@ }, "outputs": [], "source": [ - "# Check SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", + "# Check SNT configuration (shared helper)\n", + "validate_required_config_keys(\n", + " config_json = config_json,\n", + " keys = c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", 
\"DHIS2_ADMINISTRATION_2\"),\n", + " section = \"SNT_CONFIG\"\n", + ")\n", "\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", @@ -180,16 +177,15 @@ }, "outputs": [], "source": [ - "# Load file from dataset (formatting)\n", + "# Load file from dataset (formatting) using shared helper\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", + "dhis2_routine <- load_country_file_from_dataset(\n", + " dataset_id = dataset_name,\n", + " country_code = COUNTRY_CODE,\n", + " suffix = \"_routine.parquet\",\n", + " label = \"DHIS2 routine data\"\n", + ")\n", "\n", - "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", "head(dhis2_routine, 2)" @@ -206,8 +202,9 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", - "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" + "# YEAR and MONTH should be integers\n", + "# Use shared helper to normalize types when columns exist\n", + "dhis2_routine <- normalize_year_month_types(dhis2_routine)" ] }, { @@ -268,9 +265,13 @@ }, "outputs": [], "source": [ - "dhis2_routine_long <- dhis2_routine %>%\n", - " select(all_of(c(fixed_cols, DHIS2_INDICATORS))) %>%\n", - " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\")\n", + "# 
Use shared helper to select, pivot and deduplicate routine rows\n", + "dhis2_routine_long <- prepare_routine_long(\n", + " routine_df = dhis2_routine,\n", + " fixed_cols = fixed_cols,\n", + " indicators = DHIS2_INDICATORS,\n", + " deduplicate = TRUE\n", + ")\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -295,19 +296,8 @@ }, "outputs": [], "source": [ - "# check if there are any duplicates\n", - "duplicated <- dhis2_routine_long %>%\n", - " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR) %>%\n", - " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", - " filter(n > 1L)\n", - "\n", - "# Remove dups\n", - "if (nrow(duplicated) > 0) {\n", - " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " dhis2_routine_long <- dhis2_routine_long %>%\n", - " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR, .keep_all = TRUE)\n", - " head(duplicated)\n", - "}" + "# Duplicates are handled by prepare_routine_long(..., deduplicate = TRUE)\n", + "log_msg(\"Routine long data prepared with shared helper (deduplication applied).\")" ] }, { @@ -685,7 +675,12 @@ }, "outputs": [], "source": [ - "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", + "output_path <- standard_output_path(\n", + " data_root_path = DATA_PATH,\n", + " domain = \"dhis2\",\n", + " subdomain = \"outliers_imputation\",\n", + " create_dir = TRUE\n", + ")\n", "\n", "# Mean detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", From b4bd49a3261838fe7ba775ba3eeb9daad142b73b Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 25 Mar 2026 10:10:06 +0100 Subject: [PATCH 05/23] Refactor outlier imputation and reporting notebooks to utilize new shared helper scripts. 
Removed the old bootstrap script and updated paths for loading configuration and utility functions, enhancing modularity and readability across the pipeline. --- .../snt_dhis2_outliers_imputation_iqr.ipynb | 13 ++--- ...dhis2_outliers_imputation_iqr_report.ipynb | 17 +++---- .../utils/bootstrap.R | 45 ------------------ ....R => snt_dhis2_outliers_imputation_iqr.r} | 46 ++++++++++++++++++ ...nt_dhis2_outliers_imputation_iqr_report.r} | 47 +++++++++++++++++++ 5 files changed, 105 insertions(+), 63 deletions(-) delete mode 100644 pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R rename pipelines/snt_dhis2_outliers_imputation_iqr/utils/{imputation_utils.R => snt_dhis2_outliers_imputation_iqr.r} (55%) rename pipelines/snt_dhis2_outliers_imputation_iqr/utils/{reporting_utils.R => snt_dhis2_outliers_imputation_iqr_report.r} (81%) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb index dd66522..1731e12 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/code/snt_dhis2_outliers_imputation_iqr.ipynb @@ -61,8 +61,8 @@ "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_iqr.r\"))\n", "setup_ctx <- bootstrap_iqr_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", @@ -71,10 +71,7 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- 
setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Pipeline-specific helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -528,7 +525,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr.r\n", "start_time <- Sys.time()" ] }, @@ -606,7 +603,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 2fea8de..871e3eb 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -37,8 +37,8 @@ "SNT_ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_iqr\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_iqr_report.r\"))\n", "setup_ctx <- bootstrap_iqr_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", @@ -46,10 +46,7 @@ "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Reporting 
helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -88,7 +85,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/reporting_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { @@ -222,7 +219,7 @@ }, "outputs": [], "source": [ - "# Plot helpers loaded from utils/reporting_utils.R\n", + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" ] @@ -671,7 +668,7 @@ }, "outputs": [], "source": [ - "# Coherence heatmap helper loaded from utils/reporting_utils.R" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { @@ -738,7 +735,7 @@ }, "outputs": [], "source": [ - "# Coherence map helper loaded from utils/reporting_utils.R" + "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R deleted file mode 100644 index 4f9e7b3..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/bootstrap.R +++ /dev/null @@ -1,45 +0,0 @@ -# Shared bootstrap for the IQR outliers pipeline notebooks. 
-bootstrap_iqr_context <- function( - root_path = "~/workspace", - required_packages = c( - "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", - "reticulate", "glue", "zoo" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r similarity index 55% rename from pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R rename to pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r index da3b1e9..35204c2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/imputation_utils.R +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr.r @@ -1,3 +1,49 @@ +# Shared bootstrap for the IQR outliers pipeline notebooks. 
+bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + # Compute moving-average imputations for a selected outlier flag column. impute_outliers_dt <- function(dt, outlier_col) { dt <- data.table::as.data.table(dt) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r similarity index 81% rename from pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R rename to pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r index cc6c620..6625b45 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/reporting_utils.R +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r @@ -1,3 +1,50 @@ +# Shared bootstrap for the IQR outliers reporting notebook. 
+bootstrap_iqr_context <- function( + root_path = "~/workspace", + required_packages = c( + "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", + "viridis", "patchwork", "zoo", "scales", "purrr", "arrow", + "sf", "reticulate", "knitr", "glue", "forcats" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") } From 9b838fa279499a9265e7644738aac4f1e77d8fb3 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 25 Mar 2026 12:07:04 +0100 Subject: [PATCH 06/23] Refactor IQR outliers reporting notebook and utility functions to enhance modularity and readability. Updated helper functions for plotting outliers and added error handling for missing data. Improved configuration loading and path management for better integration across the pipeline. 
--- ...dhis2_outliers_imputation_iqr_report.ipynb | 379 +++++--------- ...snt_dhis2_outliers_imputation_iqr_report.r | 494 +++++++++++------- 2 files changed, 425 insertions(+), 448 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb index 871e3eb..a927dd7 100644 --- a/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/reporting/snt_dhis2_outliers_imputation_iqr_report.ipynb @@ -2,36 +2,33 @@ "cells": [ { "cell_type": "markdown", - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", "metadata": {}, "source": [ "## **Détection des valeurs aberrantes — Méthode IQR**" - ] + ], + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" }, { "cell_type": "code", - "execution_count": null, - "id": "43794265-533f-4035-bf3d-975a3409507b", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "## CONFIGURATION ##" - ] + ], + "execution_count": null, + "outputs": [], + "id": "43794265-533f-4035-bf3d-975a3409507b" }, { "cell_type": "code", - "execution_count": null, - "id": "2ced7513-0ee6-4b9b-ac07-124e510119af", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", @@ -47,18 +44,18 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "openhexa <- setup_ctx$openhexa" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2ced7513-0ee6-4b9b-ac07-124e510119af" }, { "cell_type": "code", - "execution_count": null, - "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Load SNT config from bootstrap context\n", "config_json <- setup_ctx$config_json\n", @@ -72,32 +69,32 @@ "ADM_1 <- 
toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL" - ] + ], + "execution_count": null, + "outputs": [], + "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51" }, { "cell_type": "code", - "execution_count": null, - "id": "f8edc2a5-07ce-4507-9939-4322fc510593", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8edc2a5-07ce-4507-9939-4322fc510593" }, { "cell_type": "code", - "execution_count": null, - "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# import routine data\n", "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")) }, \n", @@ -122,26 +119,26 @@ " })\n", "\n", "printdim(routine_data)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac" }, { "cell_type": "markdown", - "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b", "metadata": {}, "source": [ "### 1. Résumé des valeurs aberrantes détectées dans les données de routine" - ] + ], + "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b" }, { "cell_type": "code", - "execution_count": null, - "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "outlier_flags <- routine_data %>%\n", " # Keep only OUTLIER_* columns that are logical flags.\n", @@ -178,26 +175,26 @@ "}\n", "\n", "outlier_summary_long" - ] + ], + "execution_count": null, + "outputs": [], + "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2" }, { "cell_type": "markdown", - "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d", "metadata": {}, "source": [ "### 2. 
Visualisation des valeurs aberrantes (méthode IQR)" - ] + ], + "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d" }, { "cell_type": "code", - "execution_count": null, - "id": "abeae17b-935d-49d9-a239-89985e469d81", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "#--- PARAMETERS ---\n", "outlier_cols <- if (\"OUTLIER_DETECTED\" %in% names(routine_data)) {\n", @@ -206,27 +203,29 @@ " routine_data %>% select(starts_with(\"OUTLIER_\")) %>% names()\n", "}\n", "print(outlier_cols)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "abeae17b-935d-49d9-a239-89985e469d81" }, { "cell_type": "code", - "execution_count": null, - "id": "99dd199e-bcf1-4900-b96e-f0f3285caec2", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" - ] + ], + "execution_count": null, + "outputs": [], + "id": "99dd199e-bcf1-4900-b96e-f0f3285caec2" }, { "cell_type": "markdown", - "id": "764e6f6a-f810-4077-8ed4-6d5b24c4caf4", "metadata": {}, "source": [ "### Include plots \n", @@ -234,18 +233,16 @@ "-Clean folder \n", "-Save Images \n", "-Load the images " - ] + ], + "id": "764e6f6a-f810-4077-8ed4-6d5b24c4caf4" }, { "cell_type": "code", - "execution_count": null, - "id": "8e4834fe-16d3-40ca-91a3-a38cd8e301d0", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Create folder if it doesn't exist\n", "output_dir <- file.path(getwd(), \"outputs/plots\")\n", @@ -255,18 +252,18 @@ " files <- list.files(output_dir, full.names = TRUE)\n", " if (length(files) > 0) file.remove(files)\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "8e4834fe-16d3-40ca-91a3-a38cd8e301d0" }, { "cell_type": "code", - "execution_count": null, - "id": "df34b8d6-489f-42d7-a9b3-3862edc6b780", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ 
"selected_inds <- c(\"SUSP\", \"TEST\", \"CONF\")\n", "\n", @@ -288,19 +285,21 @@ "for (img in img_files) {\n", " IRdisplay::display_png(file = img)\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "df34b8d6-489f-42d7-a9b3-3862edc6b780" }, { "cell_type": "markdown", - "id": "1ad3de61-1ff3-430b-83d8-f1c9fc924b43", "metadata": {}, "source": [ "### 3. Cohérence des indicateurs au niveau nationale" - ] + ], + "id": "1ad3de61-1ff3-430b-83d8-f1c9fc924b43" }, { "cell_type": "markdown", - "id": "bd92f817-83a5-4597-b547-c9f0a506c08a", "metadata": {}, "source": [ "La section ci-dessous est un extrait des explications fournies par la **Community code library for SNT**. Veuillez consulter le site Web pour obtenir des explications complètes: https://ahadi-analytics.github.io/snt-code-library/english/library/data/routine_cases/quality_control.html#cb19-55\n", @@ -320,18 +319,16 @@ "**Décès toutes causes confondues ≥ décès dus au paludisme**: un rapport de 1:1 implique que tous les décès déclarés sont attribués au paludisme. Lorsque les décès toutes causes confondues sont inférieurs aux décès dus au paludisme, cela reflète des problèmes de qualité des données liés à une déclaration inadéquate de l'une ou des deux variables.\n", "\n", "**Admissions pour paludisme ≥ décès dus au paludisme**: un rapport de 1:1 implique que tous les cas de paludisme hospitalisés sont décédés, soit un taux de mortalité hospitalière de 100 %. Lorsque les admissions pour paludisme sont inférieures aux décès dus au paludisme, cela reflète des problèmes de qualité des données liés à une déclaration inadéquate de l'une ou des deux variables." 
- ] + ], + "id": "bd92f817-83a5-4597-b547-c9f0a506c08a" }, { "cell_type": "code", - "execution_count": null, - "id": "4ff0b337-2370-44ca-ac66-24068e7680c0", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Step 1: Extract year, month from PERIOD & aggregate\n", "routine_month <- routine_data_imputed %>%\n", @@ -348,18 +345,18 @@ " PRES = sum(PRES, na.rm = TRUE),\n", " .groups = \"drop\"\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "4ff0b337-2370-44ca-ac66-24068e7680c0" }, { "cell_type": "code", - "execution_count": null, - "id": "e183891f-c823-4fa2-9cbf-74143a8526fc", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Step 2: Plot monthly national trends\n", "options(repr.plot.width = 14, repr.plot.height = 6)\n", @@ -379,18 +376,18 @@ " legend.title = element_text(size = 16),\n", " legend.text = element_text(size = 16)\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e183891f-c823-4fa2-9cbf-74143a8526fc" }, { "cell_type": "code", - "execution_count": null, - "id": "408b0396-346b-45a5-9358-6375400a6767", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Identify indicator columns automatically (all numeric except YEAR, MONTH, IDs)\n", "indicator_cols <- routine_data_imputed %>%\n", @@ -404,18 +401,18 @@ " ungroup()\n", "\n", "yearly_totals %>% select(YEAR, SUSP, TEST, CONF)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "408b0396-346b-45a5-9358-6375400a6767" }, { "cell_type": "code", - "execution_count": null, - "id": "54c415c8-8431-413c-9373-4d957776eff7", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Step 3: Create scatter plots\n", "routine_hd_month <- routine_data_imputed %>%\n", @@ -456,257 +453,138 @@ "\n", "# Step 3: Combine plots\n", "(p1 | p2 | p3) + plot_layout(guides = \"collect\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": 
"54c415c8-8431-413c-9373-4d957776eff7" }, { "cell_type": "markdown", - "id": "61c5a165-af0e-45a4-9d2e-8901451a4d6b", "metadata": {}, "source": [ "Le graphique en bas montre le **pourcentage de rapports mensuels des formations sanitaires au niveau national** qui ont passé chaque contrôle de cohérence pour chaque année. Chaque cellule indique la proportion de rapports mensuels d’une année donnée qui respectent la règle de cohérence correspondante. Évaluer ces contrôles d’une année à l’autre et entre catégories permet d’identifier les **tendances générales de la qualité des données**." - ] + ], + "id": "61c5a165-af0e-45a4-9d2e-8901451a4d6b" }, { "cell_type": "code", - "execution_count": null, - "id": "4e530e1c-c6be-4de0-bc23-57ce1135ed59", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" - ] + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" + ], + "execution_count": null, + "outputs": [], + "id": "4e530e1c-c6be-4de0-bc23-57ce1135ed59" }, { "cell_type": "code", - "execution_count": null, - "id": "74f70f5c-a56d-4c47-bc5a-3cc828759d54", 
"metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. 
Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" - ] + ], + "execution_count": null, + "outputs": [], + "id": "74f70f5c-a56d-4c47-bc5a-3cc828759d54" }, { "cell_type": "markdown", - "id": "da37c36d-bf7e-4ed7-8cfc-b377329b8d89", "metadata": {}, "source": [ "### 4. 
Visualisation de la cohérence au niveau du AMD1" - ] + ], + "id": "da37c36d-bf7e-4ed7-8cfc-b377329b8d89" }, { "cell_type": "code", - "execution_count": null, - "id": "ca2d499c-4f17-4ec0-a090-3ca75daa914e", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", 
+ "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "ca2d499c-4f17-4ec0-a090-3ca75daa914e" }, { "cell_type": "code", - "execution_count": null, - "id": "c159c47c-a2b1-411a-98fb-50d5173ebd0b", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c159c47c-a2b1-411a-98fb-50d5173ebd0b" }, { "cell_type": "code", - "execution_count": null, - "id": "f11eb536-b633-40fb-b182-f67ea51e0c66", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Plot per year\n", "years_available <- sort(unique(adm_long$YEAR))\n", "for (year in years_available) {\n", " plot_coherence_heatmap(df = adm_long, selected_year = year, agg_level = \"ADM1_NAME\")\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f11eb536-b633-40fb-b182-f67ea51e0c66" }, { "cell_type": "markdown", - "id": "76315fa7-2fa2-4ea2-baf2-bc427b3f659a", "metadata": {}, "source": [ "### 5. 
Visualisation de la cohérence au niveau du AMD2" - ] + ], + "id": "76315fa7-2fa2-4ea2-baf2-bc427b3f659a" }, { "cell_type": "code", - "execution_count": null, - "id": "8930acc3-aafa-4ecb-9a1d-a48fc2faf1cd", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "shapes_data <- shapes_data %>%\n", " mutate(ADM2_ID = as.character(ADM2_ID))\n", @@ -722,32 +600,32 @@ " ADM2_NAME_shape = ADM2_NAME.x,\n", " ADM2_NAME_data = ADM2_NAME.y\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "8930acc3-aafa-4ecb-9a1d-a48fc2faf1cd" }, { "cell_type": "code", - "execution_count": null, - "id": "002ca971-fdff-4e12-ad0f-5daae51ba126", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_iqr_report.r" - ] + ], + "execution_count": null, + "outputs": [], + "id": "002ca971-fdff-4e12-ad0f-5daae51ba126" }, { "cell_type": "code", - "execution_count": null, - "id": "88982e8e-43f4-4b6d-9f04-a61a03217c8d", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Loop over all available columns\n", "for (check_col in names(check_labels)) {\n", @@ -763,19 +641,22 @@ " # width = 14, height = 10, dpi = 300)\n", " }\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "88982e8e-43f4-4b6d-9f04-a61a03217c8d" }, { "cell_type": "code", - "execution_count": null, - "id": "5d0b89bb-53c8-4d0d-aa39-71341956f1d0", "metadata": { "vscode": { "languageId": "r" } }, + "source": [], + "execution_count": null, "outputs": [], - "source": [] + "id": "5d0b89bb-53c8-4d0d-aa39-71341956f1d0" } ], "metadata": { @@ -795,4 +676,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r index 6625b45..38de5d1 100644 --- 
a/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_iqr/utils/snt_dhis2_outliers_imputation_iqr_report.r @@ -1,213 +1,309 @@ -# Shared bootstrap for the IQR outliers reporting notebook. -bootstrap_iqr_context <- function( - root_path = "~/workspace", - required_packages = c( - "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", - "viridis", "patchwork", "zoo", "scales", "purrr", "arrow", - "sf", "reticulate", "knitr", "glue", "forcats" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) +# Report helpers for the IQR outliers pipeline. - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) +`%||%` <- function(x, y) if (!is.null(x)) x else y + +# Pull in bootstrap + shared non-report helpers (same folder). 
+.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() } +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_iqr.r")) printdim <- function(df, name = deparse(substitute(df))) { - cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") + if (is.null(df)) { + message(sprintf("%s: NULL", name)) + return(invisible(NULL)) + } + d <- dim(df) + message(sprintf("%s: %s x %s", name, d[1], d[2])) + invisible(d) } -plot_outliers <- function(ind_name, df, outlier_col) { - df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) - df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) - - ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + - ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - ggplot2::aes(x = YEAR, y = VALUE), - color = "red", - size = 2.8, - alpha = 0.85, - na.rm = TRUE - ) + - ggplot2::labs( - title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), - subtitle = "Gris = toutes les valeurs • Rouge = valeurs aberrantes détectées", - x = "Année", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 14) +plot_outliers <- function(ind_name, df, outlier_col = "OUTLIER_DETECTED") { + if (!ind_name %in% names(df)) return(NULL) + if (!outlier_col %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate( + YEAR = as.integer(.data$YEAR %||% substr(.data$PERIOD, 1, 4)), + MONTH = as.integer(.data$MONTH %||% substr(.data$PERIOD, 5, 6)), + DATE = as.Date(sprintf("%04d-%02d-01", YEAR, MONTH)) + ) %>% + dplyr::group_by(.data$DATE) %>% + dplyr::summarise( + value = sum(.data[[ind_name]], na.rm = TRUE), + has_outlier = any(.data[[outlier_col]] %in% 
TRUE, na.rm = TRUE), + .groups = "drop" + ) + + ggplot2::ggplot(d, ggplot2::aes(x = .data$DATE, y = .data$value)) + + ggplot2::geom_line(linewidth = 0.8, color = "grey40") + + ggplot2::geom_point(ggplot2::aes(color = .data$has_outlier), size = 2, alpha = 0.9) + + ggplot2::scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) + + ggplot2::labs( + title = sprintf("Outliers - %s (%s)", ind_name, outlier_col), + x = "Mois", + y = "Valeur agregee", + color = "Outlier present" + ) + + ggplot2::theme_minimal(base_size = 14) } -plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { - df_ind <- df %>% - dplyr::filter( - INDICATOR == ind_name, - !is.na(YEAR), - !is.na(VALUE), - is.finite(VALUE) - ) - - if (nrow(df_ind) == 0) { - return(NULL) - } - - ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + - ggplot2::geom_point(color = "grey60", alpha = 0.3) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - color = "red", - size = 2.8, - alpha = 0.85 - ) + - ggplot2::facet_wrap(~ YEAR, scales = "free_y") + - ggplot2::labs( - title = paste("Détection des valeurs aberrantes —", ind_name), - subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), - x = "District (ADM2)", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 13) + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7) - ) +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col = "OUTLIER_DETECTED") { + if (!ind_name %in% names(df)) return(NULL) + if (!outlier_col %in% names(df)) return(NULL) + if (!("ADM2_NAME" %in% names(df) && "ADM2_ID" %in% names(df))) return(NULL) + + d <- df %>% + dplyr::mutate( + YEAR = as.integer(.data$YEAR %||% substr(.data$PERIOD, 1, 4)), + MONTH = as.integer(.data$MONTH %||% substr(.data$PERIOD, 5, 6)), + DATE = as.Date(sprintf("%04d-%02d-01", YEAR, MONTH)) + ) %>% + dplyr::group_by(.data$ADM2_ID, .data$ADM2_NAME, 
.data$YEAR, .data$MONTH, .data$DATE) %>% + dplyr::summarise( + value = sum(.data[[ind_name]], na.rm = TRUE), + has_outlier = any(.data[[outlier_col]] %in% TRUE, na.rm = TRUE), + .groups = "drop" + ) + + if (nrow(d) == 0) return(NULL) + + ggplot2::ggplot( + d, + ggplot2::aes(x = .data$DATE, y = .data$value, group = .data$ADM2_ID) + ) + + ggplot2::geom_line(alpha = 0.35, linewidth = 0.4, color = "grey40") + + ggplot2::geom_point(ggplot2::aes(color = .data$has_outlier), alpha = 0.75, size = 1) + + ggplot2::scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "grey70")) + + ggplot2::facet_wrap(~YEAR, scales = "free_x") + + ggplot2::labs( + title = sprintf("Outliers par district - %s (%s)", ind_name, outlier_col), + x = "Mois", + y = "Valeur (ADM2 agrege)", + color = "Outlier" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + legend.position = "bottom", + strip.text = ggplot2::element_text(face = "bold") + ) } -plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!agg_level %in% names(df)) { - stop(paste0("Aggregation level '", agg_level, "' not found in data!")) - } - - df_year <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% - dplyr::summarise( - pct_coherent = mean(pct_coherent, na.rm = TRUE), - .groups = "drop" - ) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% - dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) - - n_units <- dplyr::n_distinct(df_year[[agg_level]]) - plot_height <- max(6, 0.5 * n_units) - agg_label <- if (agg_level == "ADM1_NAME") { - "niveau administratif 1" - } else if (agg_level == "ADM2_NAME") { - "niveau administratif 2" - } else { - agg_level - } - - p <- ggplot2::ggplot(df_year, ggplot2::aes(x = check_label, y = 
.data[[agg_level]], fill = pct_coherent)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::geom_text( - ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), - size = 5, - fontface = "bold", - color = "white" - ) + - viridis::scale_fill_viridis( - name = "% cohérent", - limits = c(0, 100), - option = "viridis", - direction = -1 - ) + - ggplot2::labs( - title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), - x = "Règle de cohérence", - y = agg_label - ) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.y = ggplot2::element_text(size = 12), - axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), - plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), - legend.title = ggplot2::element_text(size = 12), - legend.text = ggplot2::element_text(size = 10) - ) - - options(repr.plot.width = 14, repr.plot.height = plot_height) - - if (!is.null(filename)) { - ggplot2::ggsave( - filename = filename, - plot = p, - width = 14, - height = plot_height, - dpi = 300, - limitsize = FALSE - ) - } - if (do_plot) { - print(p) - } +plot_coherence_heatmap <- function( + df, + selected_year, + agg_level = "ADM1_NAME", + filename = NULL, + do_plot = TRUE +) { + if (!all(c("YEAR", "check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) + ) + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + + ggplot2::labs( + title = sprintf("Coherence 
(%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) + invisible(p) } plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!col_name %in% names(map_data)) { - stop(paste0("Column '", col_name, "' not found in the data!")) - } - - if (is.null(indicator_label)) { - indicator_label <- col_name - } - - ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + - viridis::scale_fill_viridis( - name = paste0("% cohérence\n(", indicator_label, ")"), - option = "magma", - direction = -1, - limits = c(0, 100), - na.value = "grey90" - ) + - ggplot2::facet_wrap(~ YEAR, drop = TRUE) + - ggplot2::labs( - title = "Cohérence des données par niveau administratif 2 et par année", - subtitle = paste("Indicateur :", indicator_label), - caption = "Source : DHIS2 données routinières" - ) + - ggplot2::theme_minimal(base_size = 15) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - strip.text = ggplot2::element_text(size = 14, face = "bold"), - plot.title = ggplot2::element_text(size = 20, face = "bold"), - legend.position = "right" - ) + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + 
legend.position = "right" + ) +} + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- 
function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", 
"", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) } From 776887dab5ea567c374461cb280064b80d83045c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 13:19:15 +0100 Subject: [PATCH 07/23] fix for outliers pipelines --- ...s2_outliers_imputation_magic_glasses.ipynb | 4 +- ...iers_imputation_magic_glasses_report.ipynb | 2 +- ..._dhis2_outliers_imputation_magic_glasses.r | 6 + ...outliers_imputation_magic_glasses_report.r | 5 + .../snt_dhis2_outliers_imputation_mean.ipynb | 9 +- ...his2_outliers_imputation_mean_report.ipynb | 168 +++--------------- .../snt_dhis2_outliers_imputation_mean.r | 6 + ...nt_dhis2_outliers_imputation_mean_report.r | 150 ++++++++++++++++ ...snt_dhis2_outliers_imputation_median.ipynb | 9 +- ...s2_outliers_imputation_median_report.ipynb | 168 +++--------------- .../snt_dhis2_outliers_imputation_median.r | 6 + ..._dhis2_outliers_imputation_median_report.r | 150 ++++++++++++++++ 12 files changed, 377 insertions(+), 306 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r create mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r create mode 100644 
pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 78a0083..211d395 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -64,7 +64,7 @@ "outputs": [], "source": [ "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses.r\"))\n", "\n", "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", @@ -84,8 +84,6 @@ "DATA_PATH <- setup_ctx$DATA_PATH\n", "openhexa <- setup_ctx$openhexa\n", "\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"magic_glasses_utils.R\"))\n", - "\n", "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", "}\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb index 3ceddcb..2340133 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/reporting/snt_dhis2_outliers_imputation_magic_glasses_report.ipynb @@ -30,7 +30,7 @@ "ROOT_PATH 
<- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses_report.r\"))\n", "setup_ctx <- bootstrap_magic_glasses_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"jsonlite\", \"arrow\", \"glue\", \"reticulate\", \"dplyr\", \"ggplot2\", \"knitr\", \"scales\")\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r new file mode 100644 index 0000000..af6ad57 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -0,0 +1,6 @@ +# Main helpers for magic glasses outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "magic_glasses_utils.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r new file mode 100644 index 0000000..af9912d --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -0,0 +1,5 @@ +# Report helpers for magic glasses outliers imputation pipeline. 
+.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 62c5d49..cdefca5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -61,8 +61,8 @@ "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_mean.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", @@ -71,10 +71,7 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Pipeline-specific helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index ce58bf5..ca1ba0d 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ 
b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -37,8 +37,8 @@ "SNT_ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_mean\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_mean_report.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", @@ -46,10 +46,7 @@ "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Reporting helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -480,24 +477,10 @@ }, "outputs": [], "source": [ - "# ---- 0. 
Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_mean_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -511,83 +494,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. 
Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. 
Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -610,52 +524,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r new file mode 
100644 index 0000000..df776cd --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -0,0 +1,6 @@ +# Main helpers for mean outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "imputation_utils.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r new file mode 100644 index 0000000..7ff945a --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -0,0 +1,150 @@ +# Report helpers for mean outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "reporting_utils.R")) + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), 
function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = 
ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) +} + diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index 4c81480..a16e999 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ 
b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -61,8 +61,8 @@ "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (code)\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_median.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = ROOT_PATH,\n", " required_packages = c(\"data.table\", \"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\", \"zoo\")\n", @@ -71,10 +71,7 @@ "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Pipeline-specific helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"imputation_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 04b02b6..854ca4e 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -37,8 +37,8 @@ "SNT_ROOT_PATH <- \"~/workspace\"\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_median\")\n", "\n", - "# Shared bootstrap for this pipeline\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"bootstrap.R\"))\n", + "# Shared helpers for this pipeline (report)\n", + "source(file.path(PIPELINE_PATH, \"utils\", 
\"snt_dhis2_outliers_imputation_median_report.r\"))\n", "setup_ctx <- bootstrap_outliers_context(\n", " root_path = SNT_ROOT_PATH,\n", " required_packages = c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\", \"forcats\")\n", @@ -46,10 +46,7 @@ "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "# Reporting helpers\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"reporting_utils.R\"))" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -480,24 +477,10 @@ }, "outputs": [], "source": [ - "# ---- 0. Define the checks, columns and labels ----\n", - "checks <- list(\n", - " allout_susp = c(\"ALLOUT\", \"SUSP\"), \n", - " allout_test = c(\"ALLOUT\", \"TEST\"), \n", - " susp_test = c(\"SUSP\", \"TEST\"), \n", - " test_conf = c(\"TEST\", \"CONF\"), \n", - " conf_treat = c(\"CONF\", \"MALTREAT\"), \n", - " adm_dth = c(\"MALADM\", \"MALDTH\") \n", - ")\n", - "\n", - "check_labels <- c(\n", - " pct_coherent_allout_susp = \"Ambulatoire ≥ Suspects\",\n", - " pct_coherent_allout_test = \"Ambulatoire ≥ Testés\",\n", - " pct_coherent_susp_test = \"Suspects ≥ Testés\",\n", - " pct_coherent_test_conf = \"Testés ≥ Confirmés\",\n", - " pct_coherent_conf_treat = \"Confirmés ≥ Traités\",\n", - " pct_coherent_adm_dth = \"Admissions Palu ≥ Décès Palu\"\n", - ")" + "# Coherence definitions loaded from utils/snt_dhis2_outliers_imputation_median_report.r\n", + "defs <- get_coherence_definitions()\n", + "checks <- defs$checks\n", + "check_labels <- defs$check_labels" ] }, { @@ -511,83 +494,14 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency checks dynamically ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# ---- 2. Summarise percent coherent per year ----\n", - "check_cols <- intersect(paste0(\"check_\", names(checks)), names(df_checks))\n", - "\n", - "coherency_metrics <- df_checks %>%\n", - " group_by(YEAR) %>%\n", - " summarise(\n", - " across(all_of(check_cols), ~ mean(.x, na.rm = TRUE) * 100,\n", - " .names = \"pct_{.col}\"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"check_type\",\n", - " names_prefix = \"pct_check_\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent)) %>% # <-- remove missing checks entirely\n", - " mutate(\n", - " check_label = recode(\n", - " check_type,\n", - " !!!setNames(check_labels, sub(\"^pct_coherent_\", \"\", names(check_labels)))\n", - " ),\n", - " check_label = factor(check_label, levels = unique(check_label)), # preserve only existing levels\n", - " check_label = fct_reorder(check_label, pct_coherent, .fun = median, na.rm = TRUE)\n", - " )\n", - "\n", - "# ---- 3. 
Heatmap ----\n", - "coherency_plot <- ggplot(coherency_metrics, aes(\n", - " x = factor(YEAR),\n", - " y = check_label,\n", - " fill = pct_coherent\n", - ")) +\n", - " geom_tile(color = NA, width = 0.88, height = 0.88) +\n", - " geom_text(\n", - " aes(label = sprintf(\"%.0f%%\", pct_coherent)),\n", - " color = \"white\",\n", - " fontface = \"bold\",\n", - " size = 5\n", - " ) +\n", - " scale_fill_viridis(\n", - " name = \"% Cohérent\",\n", - " option = \"viridis\",\n", - " limits = c(0, 100),\n", - " direction = -1\n", - " ) +\n", - " labs(\n", - " title = \"Contrôles de cohérence des données (niveau national)\",\n", - " x = \"Année\",\n", - " y = NULL\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " panel.grid = element_blank(),\n", - " plot.title = element_text(size = 22, face = \"bold\", hjust = 0.5),\n", - " axis.text.y = element_text(size = 16, hjust = 0),\n", - " axis.text.x = element_text(size = 16),\n", - " legend.title = element_text(size = 16, face = \"bold\"),\n", - " legend.text = element_text(size = 14),\n", - " legend.key.width = unit(0.7, \"cm\"),\n", - " legend.key.height = unit(1.2, \"cm\")\n", - " )\n", + "# National coherence summary and plot via report utils\n", + "coherency_metrics <- compute_national_coherency_metrics(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels\n", + ")\n", "\n", + "coherency_plot <- plot_national_coherence_heatmap(coherency_metrics)\n", "coherency_plot" ] }, @@ -610,52 +524,16 @@ }, "outputs": [], "source": [ - "df <- routine_data_imputed\n", - "\n", - "# ---- 1. 
Build coherency check per row safely ----\n", - "df_checks <- df %>%\n", - " mutate(\n", - " !!!lapply(names(checks), function(check_name) {\n", - " cols <- checks[[check_name]]\n", - " if (all(cols %in% names(df))) {\n", - " expr(!!sym(cols[1]) >= !!sym(cols[2]))\n", - " } else {\n", - " expr(NA_real_)\n", - " }\n", - " }) %>% setNames(paste0(\"check_\", names(checks)))\n", - " )\n", - "\n", - "# Identify the check columns that actually exist\n", - "check_cols <- names(df_checks)[grepl(\"^check_\", names(df_checks))]\n", - "\n", - "valid_checks <- check_cols[\n", - " purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x)))\n", - "]\n", - "\n", - "# Compute coherence\n", - "adm_coherence <- df_checks %>%\n", - " group_by(ADM1_NAME, ADM2_NAME, ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " total_reports = n(),\n", - " !!!purrr::map(\n", - " valid_checks,\n", - " ~ expr(100 * mean(.data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " setNames(paste0(\"pct_coherent_\", sub(\"^check_\", \"\", valid_checks))),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " filter(total_reports >= 5)\n", - "\n", - "# To long format\n", - "adm_long <- adm_coherence %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_coherent_\"),\n", - " names_to = \"check_type\",\n", - " values_to = \"pct_coherent\"\n", - " ) %>%\n", - " filter(!is.na(pct_coherent))\n", + "# ADM coherence summaries via report utils\n", + "adm_coherence_data <- compute_adm_coherence_long(\n", + " routine_data_imputed,\n", + " checks,\n", + " check_labels,\n", + " min_reports = 5\n", + ")\n", "\n", - "adm_long <- adm_long %>% mutate(check_label = recode(check_type, !!!check_labels))\n", + "adm_coherence <- adm_coherence_data$adm_coherence\n", + "adm_long <- adm_coherence_data$adm_long\n", "\n", "head(adm_long)" ] diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r new file 
mode 100644 index 0000000..b5052dd --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -0,0 +1,6 @@ +# Main helpers for median outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "imputation_utils.R")) + diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r new file mode 100644 index 0000000..393a589 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -0,0 +1,150 @@ +# Report helpers for median outliers imputation pipeline. +.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) +.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "reporting_utils.R")) + +get_coherence_definitions <- function() { + checks <- list( + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") + ) + + check_labels <- c( + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" + ) + + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + df_checks <- df %>% + dplyr::mutate( + 
!!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) + ) +} + +plot_national_coherence_heatmap <- function(coherency_metrics) { + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 + ) + + ggplot2::labs( + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL + ) + + ggplot2::theme_minimal(base_size = 14) + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + 
axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") + ) +} + +compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) + ) %>% + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) +} + From b882af154e635e64c52aa6b91e5b19cfa7f73673 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 13:25:16 +0100 Subject: [PATCH 08/23] fix names and deleted old files --- .../utils/bootstrap.R | 28 -- .../utils/magic_glasses_utils.R | 97 ------ ..._dhis2_outliers_imputation_magic_glasses.r | 129 +++++++- ...outliers_imputation_magic_glasses_report.r | 2 +- 
.../utils/bootstrap.R | 45 --- .../utils/imputation_utils.R | 39 --- .../utils/reporting_utils.R | 124 -------- .../snt_dhis2_outliers_imputation_mean.r | 88 +++++- ...nt_dhis2_outliers_imputation_mean_report.r | 294 ++++++++++-------- .../utils/bootstrap.R | 45 --- .../utils/imputation_utils.R | 39 --- .../utils/reporting_utils.R | 124 -------- .../snt_dhis2_outliers_imputation_median.r | 88 +++++- ..._dhis2_outliers_imputation_median_report.r | 294 ++++++++++-------- 14 files changed, 618 insertions(+), 818 deletions(-) delete mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R delete mode 100644 pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R deleted file mode 100644 index 3e1d63d..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/bootstrap.R +++ /dev/null @@ -1,28 +0,0 @@ -# Shared bootstrap for Magic Glasses notebooks. 
-bootstrap_magic_glasses_context <- function( - root_path = "~/workspace", - required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(unique(required_packages)) - - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R deleted file mode 100644 index ce57811..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/magic_glasses_utils.R +++ /dev/null @@ -1,97 +0,0 @@ -detect_outliers_mad_custom <- function(dt, deviation) { - flag_col <- paste0("OUTLIER_MAD", deviation) - dt <- data.table::copy(dt) - dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] - dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] - dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))] - dt[is.na(get(flag_col)), (flag_col) := FALSE] - dt[, c("median_val", "mad_val") := NULL] - dt -} - -detect_seasonal_outliers <- function(dt, deviation, workers = 1) { - outlier_col <- paste0("OUTLIER_SEASONAL", deviation) - dt <- data.table::copy(dt) - data.table::setorder(dt, OU_ID, INDICATOR, PERIOD) - - process_group <- function(sub_dt) { - n_valid <- sum(!is.na(sub_dt$VALUE)) - if (n_valid < 2) { - return(data.table::data.table( - PERIOD = sub_dt$PERIOD, - OU_ID = sub_dt$OU_ID, - INDICATOR = 
sub_dt$INDICATOR, - OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) - )) - } - - values <- as.numeric(sub_dt$VALUE) - ts_data <- stats::ts(values, frequency = 12) - cleaned_ts <- tryCatch( - forecast::tsclean(ts_data, replace.missing = TRUE), - error = function(e) ts_data - ) - mad_val <- mad(values, constant = 1, na.rm = TRUE) - - if (is.na(mad_val) || mad_val == 0) { - return(data.table::data.table( - PERIOD = sub_dt$PERIOD, - OU_ID = sub_dt$OU_ID, - INDICATOR = sub_dt$INDICATOR, - OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) - )) - } - - is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation - is_outlier[is.na(is_outlier)] <- FALSE - - data.table::data.table( - PERIOD = sub_dt$PERIOD, - OU_ID = sub_dt$OU_ID, - INDICATOR = sub_dt$INDICATOR, - OUTLIER_FLAG = as.logical(is_outlier) - ) - } - - group_keys <- unique(dt[, .(OU_ID, INDICATOR)]) - group_list <- lapply(seq_len(nrow(group_keys)), function(i) { - dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]] - }) - - if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) { - result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE) - } else { - result_list <- lapply(group_list, process_group) - } - - outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE) - data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col) - - result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE) - result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE] - result_dt -} - -to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { - routine_wide <- data.table::dcast( - dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], - PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR, - value.var = "VALUE" - ) - - routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) - - target_cols <- 
c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", indicators_to_keep) - for (col in setdiff(target_cols, names(routine_wide))) { - if (col %in% indicators_to_keep) { - routine_wide[, (col) := NA_real_] - } else if (col %in% c("YEAR", "MONTH")) { - routine_wide[, (col) := NA_integer_] - } else { - routine_wide[, (col) := NA_character_] - } - } - cols_to_keep <- intersect(target_cols, names(routine_wide)) - routine_wide <- routine_wide[, ..cols_to_keep] - routine_wide -} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index af6ad57..28ec619 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -1,6 +1,127 @@ # Main helpers for magic glasses outliers imputation pipeline. 
-.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "magic_glasses_utils.R")) +bootstrap_magic_glasses_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "data.table", "jsonlite", "reticulate", "glue"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(unique(required_packages)) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa + )) +} + +detect_outliers_mad_custom <- function(dt, deviation) { + flag_col <- paste0("OUTLIER_MAD", deviation) + dt <- data.table::copy(dt) + dt[, median_val := median(VALUE, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, mad_val := mad(VALUE, constant = 1, na.rm = TRUE), by = .(YEAR, OU_ID, INDICATOR)] + dt[, (flag_col) := (VALUE > (median_val + deviation * mad_val)) | (VALUE < (median_val - deviation * mad_val))] + dt[is.na(get(flag_col)), (flag_col) := FALSE] + dt[, c("median_val", "mad_val") := NULL] + dt +} + +detect_seasonal_outliers <- function(dt, deviation, workers = 1) { + outlier_col <- paste0("OUTLIER_SEASONAL", deviation) + dt <- data.table::copy(dt) + data.table::setorder(dt, OU_ID, INDICATOR, PERIOD) + + process_group <- function(sub_dt) { + n_valid <- sum(!is.na(sub_dt$VALUE)) + if (n_valid < 2) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + values 
<- as.numeric(sub_dt$VALUE) + ts_data <- stats::ts(values, frequency = 12) + cleaned_ts <- tryCatch( + forecast::tsclean(ts_data, replace.missing = TRUE), + error = function(e) ts_data + ) + mad_val <- mad(values, constant = 1, na.rm = TRUE) + + if (is.na(mad_val) || mad_val == 0) { + return(data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = rep(FALSE, nrow(sub_dt)) + )) + } + + is_outlier <- abs(as.numeric(ts_data) - as.numeric(cleaned_ts)) / mad_val >= deviation + is_outlier[is.na(is_outlier)] <- FALSE + + data.table::data.table( + PERIOD = sub_dt$PERIOD, + OU_ID = sub_dt$OU_ID, + INDICATOR = sub_dt$INDICATOR, + OUTLIER_FLAG = as.logical(is_outlier) + ) + } + + group_keys <- unique(dt[, .(OU_ID, INDICATOR)]) + group_list <- lapply(seq_len(nrow(group_keys)), function(i) { + dt[OU_ID == group_keys$OU_ID[i] & INDICATOR == group_keys$INDICATOR[i]] + }) + + if (workers > 1 && requireNamespace("future.apply", quietly = TRUE)) { + result_list <- future.apply::future_lapply(group_list, process_group, future.seed = TRUE) + } else { + result_list <- lapply(group_list, process_group) + } + + outlier_flags <- data.table::rbindlist(result_list, use.names = TRUE) + data.table::setnames(outlier_flags, "OUTLIER_FLAG", outlier_col) + + result_dt <- merge(dt, outlier_flags, by = c("PERIOD", "OU_ID", "INDICATOR"), all.x = TRUE) + result_dt[is.na(get(outlier_col)), (outlier_col) := FALSE] + result_dt +} + +to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_names) { + routine_wide <- data.table::dcast( + dt_long[, .(PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE)], + PERIOD + YEAR + MONTH + ADM1_ID + ADM2_ID + OU_ID ~ INDICATOR, + value.var = "VALUE" + ) + + routine_wide <- merge(routine_wide, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) + + target_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "OU_ID", 
"OU_NAME", indicators_to_keep) + for (col in setdiff(target_cols, names(routine_wide))) { + if (col %in% indicators_to_keep) { + routine_wide[, (col) := NA_real_] + } else if (col %in% c("YEAR", "MONTH")) { + routine_wide[, (col) := NA_integer_] + } else { + routine_wide[, (col) := NA_character_] + } + } + cols_to_keep <- intersect(target_cols, names(routine_wide)) + routine_wide <- routine_wide[, ..cols_to_keep] + routine_wide +} diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r index af9912d..8c531c5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -1,5 +1,5 @@ # Report helpers for magic glasses outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) .this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_magic_glasses.r")) diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R deleted file mode 100644 index 8642d85..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/bootstrap.R +++ /dev/null @@ -1,45 +0,0 @@ -# Shared bootstrap for outliers notebooks. 
-bootstrap_outliers_context <- function( - root_path = "~/workspace", - required_packages = c( - "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", - "reticulate", "glue", "zoo" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R deleted file mode 100644 index 72f70e7..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/imputation_utils.R +++ /dev/null @@ -1,39 +0,0 @@ -impute_outliers_dt <- function(dt, outlier_col) { - dt <- data.table::as.data.table(dt) - data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) - dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] - dt[, MOVING_AVG := data.table::frollapply( - TO_IMPUTE, - n = 3, - FUN = function(x) ceiling(mean(x, na.rm = TRUE)), - align = "center" - ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] - dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] - dt[, 
c("TO_IMPUTE") := NULL] - return(as.data.frame(data.table::copy(dt))) -} - -format_routine_data_selection <- function( - df, - outlier_column, - DHIS2_INDICATORS, - fixed_cols, - pyramid_names, - remove = FALSE -) { - if (remove) { - df <- df %>% dplyr::filter(!.data[[outlier_column]]) - } - target_cols <- c( - "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", - "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS - ) - output <- df %>% - dplyr::select(-VALUE) %>% - dplyr::rename(VALUE = VALUE_IMPUTED) %>% - dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% - dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% - tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% - dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) - return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R deleted file mode 100644 index 719f4f6..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/reporting_utils.R +++ /dev/null @@ -1,124 +0,0 @@ -printdim <- function(df, name = deparse(substitute(df))) { - cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") -} - -plot_outliers <- function(ind_name, df, outlier_col) { - df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) - df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) - ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + - ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - ggplot2::aes(x = YEAR, y = VALUE), - color = "red", - size = 2.8, - alpha = 0.85, - na.rm = TRUE - ) + - ggplot2::labs( - title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), - subtitle = "Gris = toutes les 
valeurs • Rouge = valeurs aberrantes détectées", - x = "Année", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 14) -} - -plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { - df_ind <- df %>% - dplyr::filter( - INDICATOR == ind_name, - !is.na(YEAR), - !is.na(VALUE), - is.finite(VALUE) - ) - if (nrow(df_ind) == 0) { - return(NULL) - } - ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + - ggplot2::geom_point(color = "grey60", alpha = 0.3) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - color = "red", - size = 2.8, - alpha = 0.85 - ) + - ggplot2::facet_wrap(~ YEAR, scales = "free_y") + - ggplot2::labs( - title = paste("Détection des valeurs aberrantes —", ind_name), - subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), - x = "District (ADM2)", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 13) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) -} - -plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!agg_level %in% names(df)) { - stop(paste0("Aggregation level '", agg_level, "' not found in data!")) - } - df_year <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% - dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% - dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% - dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) - n_units <- dplyr::n_distinct(df_year[[agg_level]]) - plot_height <- max(6, 0.5 * n_units) - agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level - p <- ggplot2::ggplot(df_year, 
ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + - viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + - ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.y = ggplot2::element_text(size = 12), - axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), - plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), - legend.title = ggplot2::element_text(size = 12), - legend.text = ggplot2::element_text(size = 10) - ) - options(repr.plot.width = 14, repr.plot.height = plot_height) - if (!is.null(filename)) { - ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) - } - if (do_plot) { - print(p) - } -} - -plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!col_name %in% names(map_data)) { - stop(paste0("Column '", col_name, "' not found in the data!")) - } - if (is.null(indicator_label)) { - indicator_label <- col_name - } - ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + - viridis::scale_fill_viridis( - name = paste0("% cohérence\n(", indicator_label, ")"), - option = "magma", - direction = -1, - limits = c(0, 100), - na.value = "grey90" - ) + - ggplot2::facet_wrap(~ YEAR, drop = TRUE) + - ggplot2::labs( - title = "Cohérence des données par niveau administratif 2 et par année", - subtitle = paste("Indicateur :", indicator_label), - caption = "Source : DHIS2 données routinières" - ) + - ggplot2::theme_minimal(base_size = 15) + - 
ggplot2::theme( - panel.grid = ggplot2::element_blank(), - strip.text = ggplot2::element_text(size = 14, face = "bold"), - plot.title = ggplot2::element_text(size = 20, face = "bold"), - legend.position = "right" - ) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r index df776cd..474f342 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean.r @@ -1,6 +1,86 @@ # Main helpers for mean outliers imputation pipeline. -.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "imputation_utils.R")) +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + 
CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + dt[, c("TO_IMPUTE") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r index 7ff945a..e9dd658 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -1,150 +1,180 @@ 
# Report helpers for mean outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) .this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "reporting_utils.R")) +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_mean.r")) -get_coherence_definitions <- function() { - checks <- list( - allout_susp = c("ALLOUT", "SUSP"), - allout_test = c("ALLOUT", "TEST"), - susp_test = c("SUSP", "TEST"), - test_conf = c("TEST", "CONF"), - conf_treat = c("CONF", "MALTREAT"), - adm_dth = c("MALADM", "MALDTH") - ) +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} - check_labels <- c( - pct_coherent_allout_susp = "Ambulatoire >= Suspects", - pct_coherent_allout_test = "Ambulatoire >= Testes", - pct_coherent_susp_test = "Suspects >= Testes", - pct_coherent_test_conf = "Testes >= Confirmes", - pct_coherent_conf_treat = "Confirmes >= Traites", - pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" - ) +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Outliers for indicator:", ind_name), + subtitle = "Grey = all values, red = detected outliers", + x = "Year", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 14) +} - list(checks = checks, check_labels = check_labels) +plot_outliers_by_district_facet_year <- function(ind_name, df, 
outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Outliers by district and year:", ind_name), + x = "District", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 12) } -compute_national_coherency_metrics <- function(df, checks, check_labels) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") + if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { + stop("Data frame is missing required columns.") + } + comp <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% + dplyr::summarise( + coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), + n = dplyr::n(), + .groups = "drop" + ) + p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + ggplot2::labs( + title = 
paste("Coherence heatmap -", agg_level, "-", selected_year), + x = agg_level, + y = "Indicator", + fill = "Coherence" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) + if (isTRUE(do_plot)) print(p) + invisible(p) +} - check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") + if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) + ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, "-", indicator_label)) + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + + ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + + ggplot2::labs(title = ttl, fill = col_name) + + ggplot2::theme_minimal(base_size = 12) +} - df_checks %>% - dplyr::group_by(.data$YEAR) %>% - dplyr::summarise( - dplyr::across( - dplyr::all_of(check_cols), - ~ mean(.x, na.rm = TRUE) * 100, - .names = "pct_{.col}" - ), - .groups = "drop" - ) %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_"), - names_to = "check_type", - names_prefix = "pct_check_", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate( - check_label = dplyr::recode( - .data$check_type, - !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) - ), - check_label = factor(.data$check_label, levels = unique(.data$check_label)), - check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) +get_coherence_definitions <- function() { + checks <- list( + "long_term" = function(x) (x >= 0.95), + "short_term" = function(x) (x >= 0.95), + 
"cyclicality" = function(x) (x >= 0.90), + "volatility" = function(x) (x >= 0.90), + "rolling_sd" = function(x) (x <= 0.80), + "spatial" = function(x) (x <= 0.80), + "residual" = function(x) (x <= 2), + "trend_strength" = function(x) (x >= 0.20) ) + check_labels <- c( + "long_term" = "Long-term (>= 95%)", + "short_term" = "Short-term (>= 95%)", + "cyclicality" = "Cyclicality (>= 90%)", + "volatility" = "Volatility (>= 90%)", + "rolling_sd" = "Rolling SD (<= 80%)", + "spatial" = "Spatial (<= 80%)", + "residual" = "Residual (<= 2)", + "trend_strength" = "Trend strength (>= 20%)" + ) + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { + vals <- df[[check_name]] + tibble::tibble( + check = check_name, + label = check_labels[[check_name]], + percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + ) + }) + coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) + coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - ggplot2::ggplot(coherency_metrics, ggplot2::aes( - x = factor(.data$YEAR), - y = .data$check_label, - fill = .data$pct_coherent - )) + - ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + - ggplot2::geom_text( - ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), - color = "white", - fontface = "bold", - size = 5 - ) + - viridis::scale_fill_viridis( - name = "% Coherent", - option = "viridis", - limits = c(0, 100), - direction = -1 - ) + - ggplot2::labs( - title = "Controles de coherence des donnees (niveau national)", - x = "Annee", - y = NULL - ) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), - axis.text.y = ggplot2::element_text(size = 16, hjust = 0), - axis.text.x = 
ggplot2::element_text(size = 16), - legend.title = ggplot2::element_text(size = 16, face = "bold"), - legend.text = ggplot2::element_text(size = 14), - legend.key.width = grid::unit(0.7, "cm"), - legend.key.height = grid::unit(1.2, "cm") - ) + ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + + ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + + ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + + ggplot2::scale_fill_gradient2( + low = "#f7fcf5", mid = "#74c476", high = "#00441b", + midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ) + + ggplot2::scale_x_continuous(expand = c(0, 0)) + + ggplot2::labs( + title = "National coherence overview", + subtitle = "Percentage of indicators meeting each coherence criterion", + x = NULL, y = NULL + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme( + axis.text.x = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + panel.grid = ggplot2::element_blank(), + legend.position = "right", + plot.title = ggplot2::element_text(face = "bold"), + plot.subtitle = ggplot2::element_text(color = "gray30"), + axis.text.y = ggplot2::element_text(face = "bold") + ) } compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA_real_) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) - - check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] - valid_checks <- check_cols[ - purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) - ] - - adm_coherence <- df_checks %>% - dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% - dplyr::summarise( - total_reports = dplyr::n(), - 
!!!purrr::map( - valid_checks, - ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) - ) %>% - stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), - .groups = "drop" - ) %>% - dplyr::filter(.data$total_reports >= min_reports) - - adm_long <- adm_coherence %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_coherent_"), - names_to = "check_type", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) - - list(adm_coherence = adm_coherence, adm_long = adm_long) + ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") + adm_long <- lapply(ADM_levels, function(level) { + df %>% + dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% + dplyr::group_by(.data[[level]], INDICATOR) %>% + dplyr::summarise( + dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), + n_reports = dplyr::n(), + .groups = "drop" + ) %>% + dplyr::filter(n_reports >= min_reports) %>% + tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% + dplyr::mutate(level = level, label = check_labels[check]) + }) %>% dplyr::bind_rows() + adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) + adm_long } - diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R deleted file mode 100644 index 8642d85..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/bootstrap.R +++ /dev/null @@ -1,45 +0,0 @@ -# Shared bootstrap for outliers notebooks. 
-bootstrap_outliers_context <- function( - root_path = "~/workspace", - required_packages = c( - "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", - "reticulate", "glue", "zoo" - ), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - config_path <- file.path(root_path, "configuration") - data_path <- file.path(root_path, "data") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") - Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - - openhexa <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - } - - config_json <- tryCatch( - { - jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) - }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") - log_msg(msg) - stop(msg) - } - ) - - return(list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - CONFIG_PATH = config_path, - DATA_PATH = data_path, - openhexa = openhexa, - config_json = config_json - )) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R deleted file mode 100644 index 72f70e7..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/imputation_utils.R +++ /dev/null @@ -1,39 +0,0 @@ -impute_outliers_dt <- function(dt, outlier_col) { - dt <- data.table::as.data.table(dt) - data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) - dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] - dt[, MOVING_AVG := data.table::frollapply( - TO_IMPUTE, - n = 3, - FUN = function(x) ceiling(mean(x, na.rm = TRUE)), - align = "center" - ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] - dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] - dt[, 
c("TO_IMPUTE") := NULL] - return(as.data.frame(data.table::copy(dt))) -} - -format_routine_data_selection <- function( - df, - outlier_column, - DHIS2_INDICATORS, - fixed_cols, - pyramid_names, - remove = FALSE -) { - if (remove) { - df <- df %>% dplyr::filter(!.data[[outlier_column]]) - } - target_cols <- c( - "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", - "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS - ) - output <- df %>% - dplyr::select(-VALUE) %>% - dplyr::rename(VALUE = VALUE_IMPUTED) %>% - dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% - dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% - tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% - dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) - return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R b/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R deleted file mode 100644 index 719f4f6..0000000 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/reporting_utils.R +++ /dev/null @@ -1,124 +0,0 @@ -printdim <- function(df, name = deparse(substitute(df))) { - cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") -} - -plot_outliers <- function(ind_name, df, outlier_col) { - df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) - df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) - ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + - ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - ggplot2::aes(x = YEAR, y = VALUE), - color = "red", - size = 2.8, - alpha = 0.85, - na.rm = TRUE - ) + - ggplot2::labs( - title = paste("Inspection des valeurs aberrantes pour indicateur:", ind_name), - subtitle = "Gris = toutes 
les valeurs • Rouge = valeurs aberrantes détectées", - x = "Année", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 14) -} - -plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { - df_ind <- df %>% - dplyr::filter( - INDICATOR == ind_name, - !is.na(YEAR), - !is.na(VALUE), - is.finite(VALUE) - ) - if (nrow(df_ind) == 0) { - return(NULL) - } - ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + - ggplot2::geom_point(color = "grey60", alpha = 0.3) + - ggplot2::geom_point( - data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), - color = "red", - size = 2.8, - alpha = 0.85 - ) + - ggplot2::facet_wrap(~ YEAR, scales = "free_y") + - ggplot2::labs( - title = paste("Détection des valeurs aberrantes —", ind_name), - subtitle = paste("Méthode :", outlier_col, "| Rouge = valeur aberrante"), - x = "District (ADM2)", - y = "Valeur" - ) + - ggplot2::theme_minimal(base_size = 13) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 75, hjust = 1, size = 7)) -} - -plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!agg_level %in% names(df)) { - stop(paste0("Aggregation level '", agg_level, "' not found in data!")) - } - df_year <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(c(agg_level, "check_label")))) %>% - dplyr::summarise(pct_coherent = mean(pct_coherent, na.rm = TRUE), .groups = "drop") %>% - dplyr::group_by(dplyr::across(dplyr::all_of(agg_level))) %>% - dplyr::mutate(median_coh = median(pct_coherent, na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(!!agg_level := forcats::fct_reorder(.data[[agg_level]], median_coh)) - n_units <- dplyr::n_distinct(df_year[[agg_level]]) - plot_height <- max(6, 0.5 * n_units) - agg_label <- if (agg_level == "ADM1_NAME") "niveau administratif 1" else if (agg_level == "ADM2_NAME") "niveau administratif 2" else agg_level - p <- ggplot2::ggplot(df_year, 
ggplot2::aes(x = check_label, y = .data[[agg_level]], fill = pct_coherent)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::geom_text(ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)), size = 5, fontface = "bold", color = "white") + - viridis::scale_fill_viridis(name = "% cohérent", limits = c(0, 100), option = "viridis", direction = -1) + - ggplot2::labs(title = paste0("Cohérence des données par ", agg_label, " - ", selected_year), x = "Règle de cohérence", y = agg_label) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.y = ggplot2::element_text(size = 12), - axis.text.x = ggplot2::element_text(size = 12, angle = 30, hjust = 1), - plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5), - legend.title = ggplot2::element_text(size = 12), - legend.text = ggplot2::element_text(size = 10) - ) - options(repr.plot.width = 14, repr.plot.height = plot_height) - if (!is.null(filename)) { - ggplot2::ggsave(filename = filename, plot = p, width = 14, height = plot_height, dpi = 300, limitsize = FALSE) - } - if (do_plot) { - print(p) - } -} - -plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!col_name %in% names(map_data)) { - stop(paste0("Column '", col_name, "' not found in the data!")) - } - if (is.null(indicator_label)) { - indicator_label <- col_name - } - ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "white", size = 0.2) + - viridis::scale_fill_viridis( - name = paste0("% cohérence\n(", indicator_label, ")"), - option = "magma", - direction = -1, - limits = c(0, 100), - na.value = "grey90" - ) + - ggplot2::facet_wrap(~ YEAR, drop = TRUE) + - ggplot2::labs( - title = "Cohérence des données par niveau administratif 2 et par année", - subtitle = paste("Indicateur :", indicator_label), - caption = "Source : DHIS2 données routinières" - ) + - ggplot2::theme_minimal(base_size = 15) + - 
ggplot2::theme( - panel.grid = ggplot2::element_blank(), - strip.text = ggplot2::element_text(size = 14, face = "bold"), - plot.title = ggplot2::element_text(size = 20, face = "bold"), - legend.position = "right" - ) -} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r index b5052dd..3b46c55 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median.r @@ -1,6 +1,86 @@ # Main helpers for median outliers imputation pipeline. -.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "imputation_utils.R")) +bootstrap_outliers_context <- function( + root_path = "~/workspace", + required_packages = c( + "data.table", "arrow", "tidyverse", "jsonlite", "DBI", "RPostgres", + "reticulate", "glue", "zoo" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + + return(list( + ROOT_PATH = root_path, + CODE_PATH = 
code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + openhexa = openhexa, + config_json = config_json + )) +} + +impute_outliers_dt <- function(dt, outlier_col) { + dt <- data.table::as.data.table(dt) + data.table::setorder(dt, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + dt[, TO_IMPUTE := data.table::fifelse(get(outlier_col) == TRUE, NA_real_, VALUE)] + dt[, MEDIAN_IMPUTED := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(median(x, na.rm = TRUE)), + align = "center" + ), by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)] + dt[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MEDIAN_IMPUTED, TO_IMPUTE)] + dt[, c("TO_IMPUTE", "MEDIAN_IMPUTED") := NULL] + return(as.data.frame(data.table::copy(dt))) +} + +format_routine_data_selection <- function( + df, + outlier_column, + DHIS2_INDICATORS, + fixed_cols, + pyramid_names, + remove = FALSE +) { + if (remove) { + df <- df %>% dplyr::filter(!.data[[outlier_column]]) + } + target_cols <- c( + "PERIOD", "YEAR", "MONTH", "ADM1_NAME", "ADM1_ID", + "ADM2_NAME", "ADM2_ID", "OU_ID", "OU_NAME", DHIS2_INDICATORS + ) + output <- df %>% + dplyr::select(-VALUE) %>% + dplyr::rename(VALUE = VALUE_IMPUTED) %>% + dplyr::select(dplyr::all_of(fixed_cols), INDICATOR, VALUE) %>% + dplyr::mutate(VALUE = ifelse(is.nan(VALUE), NA_real_, VALUE)) %>% + tidyr::pivot_wider(names_from = "INDICATOR", values_from = "VALUE") %>% + dplyr::left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) + return(output %>% dplyr::select(dplyr::all_of(intersect(target_cols, names(output))))) +} diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r index 393a589..52096f9 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r +++ 
b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -1,150 +1,180 @@ # Report helpers for median outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) .this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() -source(file.path(.this_dir, "bootstrap.R")) -source(file.path(.this_dir, "reporting_utils.R")) +source(file.path(.this_dir, "snt_dhis2_outliers_imputation_median.r")) -get_coherence_definitions <- function() { - checks <- list( - allout_susp = c("ALLOUT", "SUSP"), - allout_test = c("ALLOUT", "TEST"), - susp_test = c("SUSP", "TEST"), - test_conf = c("TEST", "CONF"), - conf_treat = c("CONF", "MALTREAT"), - adm_dth = c("MALADM", "MALDTH") - ) +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} - check_labels <- c( - pct_coherent_allout_susp = "Ambulatoire >= Suspects", - pct_coherent_allout_test = "Ambulatoire >= Testes", - pct_coherent_susp_test = "Suspects >= Testes", - pct_coherent_test_conf = "Testes >= Confirmes", - pct_coherent_conf_treat = "Confirmes >= Traites", - pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" - ) +plot_outliers <- function(ind_name, df, outlier_col) { + df_ind <- df %>% dplyr::filter(INDICATOR == ind_name) + df_ind <- df_ind %>% dplyr::filter(!is.na(YEAR), !is.na(VALUE), is.finite(VALUE)) + ggplot2::ggplot(df_ind, ggplot2::aes(x = YEAR, y = VALUE)) + + ggplot2::geom_point(alpha = 0.25, color = "grey40", na.rm = TRUE) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + ggplot2::aes(x = YEAR, y = VALUE), + color = "red", + size = 2.8, + alpha = 0.85, + na.rm = TRUE + ) + + ggplot2::labs( + title = paste("Outliers for indicator:", ind_name), + subtitle = "Grey = all values, red = detected outliers", + x = "Year", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 14) 
+} - list(checks = checks, check_labels = check_labels) +plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { + df_ind <- df %>% + dplyr::filter( + INDICATOR == ind_name, + !is.na(YEAR), + !is.na(VALUE), + is.finite(VALUE) + ) + if (nrow(df_ind) == 0) { + return(NULL) + } + ggplot2::ggplot(df_ind, ggplot2::aes(x = ADM2_ID, y = VALUE)) + + ggplot2::geom_point(color = "grey60", alpha = 0.3) + + ggplot2::geom_point( + data = df_ind %>% dplyr::filter(.data[[outlier_col]] == TRUE), + color = "red", + size = 2.8, + alpha = 0.85 + ) + + ggplot2::facet_wrap(~ YEAR, scales = "free_y") + + ggplot2::labs( + title = paste("Outliers by district and year:", ind_name), + x = "District", + y = "Value" + ) + + ggplot2::theme_minimal(base_size = 12) } -compute_national_coherency_metrics <- function(df, checks, check_labels) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) +plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { + if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") + if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { + stop("Data frame is missing required columns.") + } + comp <- df %>% + dplyr::filter(YEAR == selected_year) %>% + dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% + dplyr::summarise( + coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), + n = dplyr::n(), + .groups = "drop" + ) + p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + + ggplot2::geom_tile(color = "white", linewidth = 0.2) + + 
ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + ggplot2::labs( + title = paste("Coherence heatmap -", agg_level, "-", selected_year), + x = agg_level, + y = "Indicator", + fill = "Coherence" + ) + + ggplot2::theme_minimal(base_size = 12) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) + if (isTRUE(do_plot)) print(p) + invisible(p) +} - check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) +plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { + if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") + if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) + ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, "-", indicator_label)) + ggplot2::ggplot(map_data) + + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + + ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + + ggplot2::labs(title = ttl, fill = col_name) + + ggplot2::theme_minimal(base_size = 12) +} - df_checks %>% - dplyr::group_by(.data$YEAR) %>% - dplyr::summarise( - dplyr::across( - dplyr::all_of(check_cols), - ~ mean(.x, na.rm = TRUE) * 100, - .names = "pct_{.col}" - ), - .groups = "drop" - ) %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_"), - names_to = "check_type", - names_prefix = "pct_check_", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate( - check_label = dplyr::recode( - .data$check_type, - !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) - ), - check_label = factor(.data$check_label, levels = unique(.data$check_label)), - check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) 
+get_coherence_definitions <- function() { + checks <- list( + "long_term" = function(x) (x >= 0.95), + "short_term" = function(x) (x >= 0.95), + "cyclicality" = function(x) (x >= 0.90), + "volatility" = function(x) (x >= 0.90), + "rolling_sd" = function(x) (x <= 0.80), + "spatial" = function(x) (x <= 0.80), + "residual" = function(x) (x <= 2), + "trend_strength" = function(x) (x >= 0.20) ) + check_labels <- c( + "long_term" = "Long-term (>= 95%)", + "short_term" = "Short-term (>= 95%)", + "cyclicality" = "Cyclicality (>= 90%)", + "volatility" = "Volatility (>= 90%)", + "rolling_sd" = "Rolling SD (<= 80%)", + "spatial" = "Spatial (<= 80%)", + "residual" = "Residual (<= 2)", + "trend_strength" = "Trend strength (>= 20%)" + ) + list(checks = checks, check_labels = check_labels) +} + +compute_national_coherency_metrics <- function(df, checks, check_labels) { + coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { + vals <- df[[check_name]] + tibble::tibble( + check = check_name, + label = check_labels[[check_name]], + percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + ) + }) + coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) + coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - ggplot2::ggplot(coherency_metrics, ggplot2::aes( - x = factor(.data$YEAR), - y = .data$check_label, - fill = .data$pct_coherent - )) + - ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + - ggplot2::geom_text( - ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), - color = "white", - fontface = "bold", - size = 5 - ) + - viridis::scale_fill_viridis( - name = "% Coherent", - option = "viridis", - limits = c(0, 100), - direction = -1 - ) + - ggplot2::labs( - title = "Controles de coherence des donnees (niveau national)", - x = "Annee", - y = NULL - ) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - plot.title = 
ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), - axis.text.y = ggplot2::element_text(size = 16, hjust = 0), - axis.text.x = ggplot2::element_text(size = 16), - legend.title = ggplot2::element_text(size = 16, face = "bold"), - legend.text = ggplot2::element_text(size = 14), - legend.key.width = grid::unit(0.7, "cm"), - legend.key.height = grid::unit(1.2, "cm") - ) + ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + + ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + + ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + + ggplot2::scale_fill_gradient2( + low = "#f7fcf5", mid = "#74c476", high = "#00441b", + midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ) + + ggplot2::scale_x_continuous(expand = c(0, 0)) + + ggplot2::labs( + title = "National coherence overview", + subtitle = "Percentage of indicators meeting each coherence criterion", + x = NULL, y = NULL + ) + + ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme( + axis.text.x = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + panel.grid = ggplot2::element_blank(), + legend.position = "right", + plot.title = ggplot2::element_text(face = "bold"), + plot.subtitle = ggplot2::element_text(color = "gray30"), + axis.text.y = ggplot2::element_text(face = "bold") + ) } compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - df_checks <- df %>% - dplyr::mutate( - !!!lapply(names(checks), function(check_name) { - cols <- checks[[check_name]] - if (all(cols %in% names(df))) { - rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) - } else { - rlang::expr(NA_real_) - } - }) %>% stats::setNames(paste0("check_", names(checks))) - ) - - check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] - valid_checks <- check_cols[ - purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) - ] - - adm_coherence <- df_checks %>% 
- dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% - dplyr::summarise( - total_reports = dplyr::n(), - !!!purrr::map( - valid_checks, - ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) - ) %>% - stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), - .groups = "drop" - ) %>% - dplyr::filter(.data$total_reports >= min_reports) - - adm_long <- adm_coherence %>% - tidyr::pivot_longer( - cols = dplyr::starts_with("pct_coherent_"), - names_to = "check_type", - values_to = "pct_coherent" - ) %>% - dplyr::filter(!is.na(.data$pct_coherent)) %>% - dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) - - list(adm_coherence = adm_coherence, adm_long = adm_long) + ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") + adm_long <- lapply(ADM_levels, function(level) { + df %>% + dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% + dplyr::group_by(.data[[level]], INDICATOR) %>% + dplyr::summarise( + dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), + n_reports = dplyr::n(), + .groups = "drop" + ) %>% + dplyr::filter(n_reports >= min_reports) %>% + tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% + dplyr::mutate(level = level, label = check_labels[check]) + }) %>% dplyr::bind_rows() + adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) + adm_long } - From f14ac32fc27317f7d2c8da61e1048005c1545ade Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 14:45:27 +0100 Subject: [PATCH 09/23] Update helper file paths in outlier imputation and reporting notebooks to reflect new naming conventions. This change enhances clarity and consistency across the pipeline. 
--- .../snt_dhis2_outliers_imputation_magic_glasses.ipynb | 2 +- .../code/snt_dhis2_outliers_imputation_mean.ipynb | 4 ++-- .../snt_dhis2_outliers_imputation_mean_report.ipynb | 8 ++++---- .../code/snt_dhis2_outliers_imputation_median.ipynb | 4 ++-- .../snt_dhis2_outliers_imputation_median_report.ipynb | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index 211d395..cfdccf2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -141,7 +141,7 @@ }, "outputs": [], "source": [ - "# Helpers loaded from utils/magic_glasses_utils.R\n", + "# Helpers loaded from utils/snt_dhis2_outliers_imputation_magic_glasses.r\n", "# - detect_outliers_mad_custom()\n", "# - detect_seasonal_outliers()" ] diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index cdefca5..9413267 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -512,7 +512,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean.r\n", "start_time <- Sys.time()" ] }, @@ -590,7 +590,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean.r" ] }, { diff --git 
a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb index ca1ba0d..c650d21 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/reporting/snt_dhis2_outliers_imputation_mean_report.ipynb @@ -85,7 +85,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/reporting_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { @@ -219,7 +219,7 @@ }, "outputs": [], "source": [ - "# Plot helpers loaded from utils/reporting_utils.R\n", + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_mean_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" ] @@ -549,7 +549,7 @@ }, "outputs": [], "source": [ - "# Coherence heatmap helper loaded from utils/reporting_utils.R" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { @@ -616,7 +616,7 @@ }, "outputs": [], "source": [ - "# Coherence map helper loaded from utils/reporting_utils.R" + "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_mean_report.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb index a16e999..b6cd2c7 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/code/snt_dhis2_outliers_imputation_median.ipynb @@ -522,7 +522,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median.r\n", "start_time <- Sys.time()" ] }, @@ -600,7 +600,7 @@ }, 
"outputs": [], "source": [ - "# Helper loaded from utils/imputation_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median.r" ] }, { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb index 854ca4e..685f278 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_median/reporting/snt_dhis2_outliers_imputation_median_report.ipynb @@ -85,7 +85,7 @@ }, "outputs": [], "source": [ - "# Helper loaded from utils/reporting_utils.R" + "# Helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { @@ -219,7 +219,7 @@ }, "outputs": [], "source": [ - "# Plot helpers loaded from utils/reporting_utils.R\n", + "# Plot helpers loaded from utils/snt_dhis2_outliers_imputation_median_report.r\n", "# - plot_outliers()\n", "# - plot_outliers_by_district_facet_year()" ] @@ -549,7 +549,7 @@ }, "outputs": [], "source": [ - "# Coherence heatmap helper loaded from utils/reporting_utils.R" + "# Coherence heatmap helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { @@ -616,7 +616,7 @@ }, "outputs": [], "source": [ - "# Coherence map helper loaded from utils/reporting_utils.R" + "# Coherence map helper loaded from utils/snt_dhis2_outliers_imputation_median_report.r" ] }, { From 1dbb87fa98d15dbcf31406f826dad87db5481d53 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 15:18:36 +0100 Subject: [PATCH 10/23] fix --- .../snt_dhis2_outliers_imputation_magic_glasses_report.r | 8 +++++++- .../utils/snt_dhis2_outliers_imputation_mean_report.r | 8 +++++++- .../utils/snt_dhis2_outliers_imputation_median_report.r | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git 
a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r index 8c531c5..b754d07 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -1,5 +1,11 @@ # Report helpers for magic glasses outliers imputation pipeline. .this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} source(file.path(.this_dir, "snt_dhis2_outliers_imputation_magic_glasses.r")) diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r index e9dd658..8aca814 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -1,6 +1,12 @@ # Report helpers for mean outliers imputation pipeline. 
.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} source(file.path(.this_dir, "snt_dhis2_outliers_imputation_mean.r")) printdim <- function(df, name = deparse(substitute(df))) { diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r index 52096f9..9815e39 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -1,6 +1,12 @@ # Report helpers for median outliers imputation pipeline. 
.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (!is.na(.this_file)) dirname(.this_file) else getwd() +.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") +} else if (!is.na(.this_file)) { + dirname(.this_file) +} else { + getwd() +} source(file.path(.this_dir, "snt_dhis2_outliers_imputation_median.r")) printdim <- function(df, name = deparse(substitute(df))) { From b5f9e3d02360a7a1202d3661300ec09e1efaf921 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 26 Mar 2026 17:24:12 +0100 Subject: [PATCH 11/23] still bugged --- ...outliers_imputation_magic_glasses_report.r | 27 +- ...nt_dhis2_outliers_imputation_mean_report.r | 310 ++++++++++++------ ..._dhis2_outliers_imputation_median_report.r | 310 ++++++++++++------ 3 files changed, 456 insertions(+), 191 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r index b754d07..c3a9df6 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses_report.r @@ -1,11 +1,24 @@ # Report helpers for magic glasses outliers imputation pipeline. 
.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { - file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") -} else if (!is.na(.this_file)) { - dirname(.this_file) -} else { - getwd() +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_magic_glasses.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_magic_glasses.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_magic_glasses.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_magic_glasses.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) } -source(file.path(.this_dir, "snt_dhis2_outliers_imputation_magic_glasses.r")) +source(.target_file) diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r index 8aca814..ce792ef 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_mean/utils/snt_dhis2_outliers_imputation_mean_report.r @@ -1,13 +1,28 @@ # Report helpers for mean outliers imputation pipeline. 
.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { - file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") -} else if (!is.na(.this_file)) { - dirname(.this_file) -} else { - getwd() +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_mean.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_mean.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_mean.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_mean.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) } -source(file.path(.this_dir, "snt_dhis2_outliers_imputation_mean.r")) +source(.target_file) + +`%||%` <- function(x, y) if (!is.null(x)) x else y printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") @@ -64,123 +79,234 @@ plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { } plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") - if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { - stop("Data frame is missing required columns.") - } - comp <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% - dplyr::summarise( - coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), - n = dplyr::n(), - .groups = "drop" + if (!all(c("YEAR", 
"check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) ) - p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + ggplot2::labs( - title = paste("Coherence heatmap -", agg_level, "-", selected_year), - x = agg_level, - y = "Indicator", - fill = "Coherence" + title = sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL ) + ggplot2::theme_minimal(base_size = 12) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) - if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) - if (isTRUE(do_plot)) print(p) + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) invisible(p) } plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") - if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) - ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, 
"-", indicator_label)) + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + - ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + - ggplot2::labs(title = ttl, fill = col_name) + - ggplot2::theme_minimal(base_size = 12) + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) } get_coherence_definitions <- function() { checks <- list( - "long_term" = function(x) (x >= 0.95), - "short_term" = function(x) (x >= 0.95), - "cyclicality" = function(x) (x >= 0.90), - "volatility" = function(x) (x >= 0.90), - "rolling_sd" = function(x) (x <= 0.80), - "spatial" = function(x) (x <= 0.80), - "residual" = function(x) (x <= 2), - "trend_strength" = function(x) (x >= 0.20) + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") ) + check_labels <- c( - "long_term" = "Long-term (>= 95%)", - "short_term" = "Short-term (>= 95%)", - "cyclicality" = "Cyclicality (>= 90%)", - "volatility" = "Volatility (>= 90%)", - "rolling_sd" = "Rolling SD (<= 80%)", - "spatial" = "Spatial (<= 80%)", - "residual" = "Residual (<= 2)", - "trend_strength" = "Trend strength (>= 20%)" + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + 
pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" ) + list(checks = checks, check_labels = check_labels) } compute_national_coherency_metrics <- function(df, checks, check_labels) { - coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { - vals <- df[[check_name]] - tibble::tibble( - check = check_name, - label = check_labels[[check_name]], - percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + if (length(check_cols) == 0) { + return(tibble::tibble( + YEAR = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = factor() + )) + } + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) ) - }) - coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) - coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - 
ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + - ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + - ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + - ggplot2::scale_fill_gradient2( - low = "#f7fcf5", mid = "#74c476", high = "#00441b", - midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 ) + - ggplot2::scale_x_continuous(expand = c(0, 0)) + ggplot2::labs( - title = "National coherence overview", - subtitle = "Percentage of indicators meeting each coherence criterion", - x = NULL, y = NULL + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL ) + - ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme_minimal(base_size = 14) + ggplot2::theme( - axis.text.x = ggplot2::element_blank(), - axis.ticks = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), - legend.position = "right", - plot.title = ggplot2::element_text(face = "bold"), - plot.subtitle = ggplot2::element_text(color = "gray30"), - axis.text.y = ggplot2::element_text(face = "bold") + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") ) } 
compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") - adm_long <- lapply(ADM_levels, function(level) { - df %>% - dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% - dplyr::group_by(.data[[level]], INDICATOR) %>% - dplyr::summarise( - dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), - n_reports = dplyr::n(), - .groups = "drop" + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + if (length(valid_checks) == 0) { + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise(total_reports = dplyr::n(), .groups = "drop") %>% + dplyr::filter(.data$total_reports >= min_reports) + adm_long <- tibble::tibble( + ADM1_NAME = character(), + ADM2_NAME = character(), + ADM2_ID = character(), + YEAR = integer(), + total_reports = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = character() + ) + return(list(adm_coherence = adm_coherence, adm_long = adm_long)) + } + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) ) %>% - dplyr::filter(n_reports >= min_reports) %>% - tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% - dplyr::mutate(level = level, 
label = check_labels[check]) - }) %>% dplyr::bind_rows() - adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) - adm_long + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) } diff --git a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r index 9815e39..ea3d558 100644 --- a/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r +++ b/pipelines/snt_dhis2_outliers_imputation_median/utils/snt_dhis2_outliers_imputation_median_report.r @@ -1,13 +1,28 @@ # Report helpers for median outliers imputation pipeline. 
.this_file <- tryCatch(normalizePath(sys.frame(1)$ofile), error = function(e) NA_character_) -.this_dir <- if (exists("PIPELINE_PATH", inherits = TRUE)) { - file.path(get("PIPELINE_PATH", inherits = TRUE), "utils") -} else if (!is.na(.this_file)) { - dirname(.this_file) -} else { - getwd() +.candidate_files <- unique(c( + if (exists("PIPELINE_PATH", inherits = TRUE)) { + file.path(get("PIPELINE_PATH", inherits = TRUE), "utils", "snt_dhis2_outliers_imputation_median.r") + } else { + character(0) + }, + if (!is.na(.this_file)) { + file.path(dirname(.this_file), "snt_dhis2_outliers_imputation_median.r") + } else { + character(0) + }, + file.path(getwd(), "snt_dhis2_outliers_imputation_median.r") +)) +.target_file <- .candidate_files[file.exists(.candidate_files)][1] +if (is.na(.target_file)) { + stop(paste0( + "Could not locate snt_dhis2_outliers_imputation_median.r. Tried: ", + paste(.candidate_files, collapse = " | ") + )) } -source(file.path(.this_dir, "snt_dhis2_outliers_imputation_median.r")) +source(.target_file) + +`%||%` <- function(x, y) if (!is.null(x)) x else y printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") @@ -64,123 +79,234 @@ plot_outliers_by_district_facet_year <- function(ind_name, df, outlier_col) { } plot_coherence_heatmap <- function(df, selected_year, agg_level = "ADM1_NAME", filename = NULL, do_plot = TRUE) { - if (!(agg_level %in% c("ADM1_NAME", "ADM2_NAME"))) stop("agg_level must be ADM1_NAME or ADM2_NAME") - if (!all(c("INDICATOR", "YEAR", agg_level, "VALUE", "VALUE_IMPUTED") %in% colnames(df))) { - stop("Data frame is missing required columns.") - } - comp <- df %>% - dplyr::filter(YEAR == selected_year) %>% - dplyr::group_by(INDICATOR, !!rlang::sym(agg_level)) %>% - dplyr::summarise( - coherence = ifelse(sum(!is.na(VALUE)) == 0, NA, sum(VALUE == VALUE_IMPUTED, na.rm = TRUE) / sum(!is.na(VALUE))), - n = dplyr::n(), - .groups = "drop" + if (!all(c("YEAR", 
"check_label", "pct_coherent") %in% names(df))) return(NULL) + if (!agg_level %in% names(df)) return(NULL) + + d <- df %>% + dplyr::mutate(YEAR = as.integer(.data$YEAR)) %>% + dplyr::filter(.data$YEAR == as.integer(selected_year)) %>% + dplyr::mutate( + agg = as.character(.data[[agg_level]]), + check_label = as.character(.data$check_label) ) - p <- ggplot2::ggplot(comp, ggplot2::aes(x = .data[[agg_level]], y = INDICATOR, fill = coherence)) + - ggplot2::geom_tile(color = "white", linewidth = 0.2) + - ggplot2::scale_fill_gradient(low = "#fee5d9", high = "#a50f15", na.value = "grey90", limits = c(0, 1)) + + + if (nrow(d) == 0) return(NULL) + + p <- ggplot2::ggplot(d, ggplot2::aes( + x = .data$check_label, + y = .data$agg, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile() + + ggplot2::scale_fill_viridis_c( + name = "% coherent", + option = "viridis", + limits = c(0, 100) + ) + ggplot2::labs( - title = paste("Coherence heatmap -", agg_level, "-", selected_year), - x = agg_level, - y = "Indicator", - fill = "Coherence" + title = sprintf("Coherence (%s) - %s", agg_level, selected_year), + x = NULL, + y = NULL ) + ggplot2::theme_minimal(base_size = 12) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) - if (!is.null(filename)) ggplot2::ggsave(filename, p, width = 12, height = 6) - if (isTRUE(do_plot)) print(p) + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), + plot.title = ggplot2::element_text(face = "bold") + ) + + if (!is.null(filename)) { + ggplot2::ggsave(filename = filename, plot = p, width = 14, height = 8, dpi = 150) + } + + if (do_plot) print(p) invisible(p) } plot_coherence_map <- function(map_data, col_name, indicator_label = NULL) { - if (!inherits(map_data, "sf")) stop("map_data must be an sf object.") - if (!(col_name %in% names(map_data))) stop(paste("Column", col_name, "not found in map_data.")) - ttl <- ifelse(is.null(indicator_label), paste("Map of", col_name), paste("Map of", col_name, 
"-", indicator_label)) + if (!inherits(map_data, "sf")) return(NULL) + if (!col_name %in% names(map_data)) return(NULL) + ggplot2::ggplot(map_data) + - ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = "grey30", linewidth = 0.1) + - ggplot2::scale_fill_viridis_c(option = "C", na.value = "grey90") + - ggplot2::labs(title = ttl, fill = col_name) + - ggplot2::theme_minimal(base_size = 12) + ggplot2::geom_sf(ggplot2::aes(fill = .data[[col_name]]), color = NA) + + ggplot2::scale_fill_viridis_c( + option = "viridis", + name = indicator_label %||% col_name, + limits = c(0, 100), + na.value = "grey90" + ) + + ggplot2::labs(title = indicator_label %||% col_name) + + ggplot2::theme_void(base_size = 12) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", hjust = 0.5), + legend.position = "right" + ) } get_coherence_definitions <- function() { checks <- list( - "long_term" = function(x) (x >= 0.95), - "short_term" = function(x) (x >= 0.95), - "cyclicality" = function(x) (x >= 0.90), - "volatility" = function(x) (x >= 0.90), - "rolling_sd" = function(x) (x <= 0.80), - "spatial" = function(x) (x <= 0.80), - "residual" = function(x) (x <= 2), - "trend_strength" = function(x) (x >= 0.20) + allout_susp = c("ALLOUT", "SUSP"), + allout_test = c("ALLOUT", "TEST"), + susp_test = c("SUSP", "TEST"), + test_conf = c("TEST", "CONF"), + conf_treat = c("CONF", "MALTREAT"), + adm_dth = c("MALADM", "MALDTH") ) + check_labels <- c( - "long_term" = "Long-term (>= 95%)", - "short_term" = "Short-term (>= 95%)", - "cyclicality" = "Cyclicality (>= 90%)", - "volatility" = "Volatility (>= 90%)", - "rolling_sd" = "Rolling SD (<= 80%)", - "spatial" = "Spatial (<= 80%)", - "residual" = "Residual (<= 2)", - "trend_strength" = "Trend strength (>= 20%)" + pct_coherent_allout_susp = "Ambulatoire >= Suspects", + pct_coherent_allout_test = "Ambulatoire >= Testes", + pct_coherent_susp_test = "Suspects >= Testes", + pct_coherent_test_conf = "Testes >= Confirmes", + 
pct_coherent_conf_treat = "Confirmes >= Traites", + pct_coherent_adm_dth = "Admissions Palu >= Deces Palu" ) + list(checks = checks, check_labels = check_labels) } compute_national_coherency_metrics <- function(df, checks, check_labels) { - coherency_metrics <- purrr::imap_dfr(checks, function(cond, check_name) { - vals <- df[[check_name]] - tibble::tibble( - check = check_name, - label = check_labels[[check_name]], - percent = round(100 * mean(cond(vals), na.rm = TRUE), 1) + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- intersect(paste0("check_", names(checks)), names(df_checks)) + if (length(check_cols) == 0) { + return(tibble::tibble( + YEAR = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = factor() + )) + } + + df_checks %>% + dplyr::group_by(.data$YEAR) %>% + dplyr::summarise( + dplyr::across( + dplyr::all_of(check_cols), + ~ mean(.x, na.rm = TRUE) * 100, + .names = "pct_{.col}" + ), + .groups = "drop" + ) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_"), + names_to = "check_type", + names_prefix = "pct_check_", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate( + check_label = dplyr::recode( + .data$check_type, + !!!stats::setNames(check_labels, sub("^pct_coherent_", "", names(check_labels))) + ), + check_label = factor(.data$check_label, levels = unique(.data$check_label)), + check_label = forcats::fct_reorder(.data$check_label, .data$pct_coherent, .fun = median, na.rm = TRUE) ) - }) - coherency_metrics$label <- factor(coherency_metrics$label, levels = rev(check_labels)) - coherency_metrics } plot_national_coherence_heatmap <- function(coherency_metrics) { - 
ggplot2::ggplot(coherency_metrics, ggplot2::aes(x = 1, y = label, fill = percent)) + - ggplot2::geom_tile(color = "white", width = 0.95, height = 0.9) + - ggplot2::geom_text(ggplot2::aes(label = paste0(percent, "%")), size = 4, color = "black", fontface = "bold") + - ggplot2::scale_fill_gradient2( - low = "#f7fcf5", mid = "#74c476", high = "#00441b", - midpoint = 85, limits = c(0, 100), name = "% indicators pass" + ggplot2::ggplot(coherency_metrics, ggplot2::aes( + x = factor(.data$YEAR), + y = .data$check_label, + fill = .data$pct_coherent + )) + + ggplot2::geom_tile(color = NA, width = 0.88, height = 0.88) + + ggplot2::geom_text( + ggplot2::aes(label = sprintf("%.0f%%", .data$pct_coherent)), + color = "white", + fontface = "bold", + size = 5 + ) + + viridis::scale_fill_viridis( + name = "% Coherent", + option = "viridis", + limits = c(0, 100), + direction = -1 ) + - ggplot2::scale_x_continuous(expand = c(0, 0)) + ggplot2::labs( - title = "National coherence overview", - subtitle = "Percentage of indicators meeting each coherence criterion", - x = NULL, y = NULL + title = "Controles de coherence des donnees (niveau national)", + x = "Annee", + y = NULL ) + - ggplot2::theme_minimal(base_size = 13) + + ggplot2::theme_minimal(base_size = 14) + ggplot2::theme( - axis.text.x = ggplot2::element_blank(), - axis.ticks = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), - legend.position = "right", - plot.title = ggplot2::element_text(face = "bold"), - plot.subtitle = ggplot2::element_text(color = "gray30"), - axis.text.y = ggplot2::element_text(face = "bold") + plot.title = ggplot2::element_text(size = 22, face = "bold", hjust = 0.5), + axis.text.y = ggplot2::element_text(size = 16, hjust = 0), + axis.text.x = ggplot2::element_text(size = 16), + legend.title = ggplot2::element_text(size = 16, face = "bold"), + legend.text = ggplot2::element_text(size = 14), + legend.key.width = grid::unit(0.7, "cm"), + legend.key.height = grid::unit(1.2, "cm") ) } 
compute_adm_coherence_long <- function(df, checks, check_labels, min_reports = 5) { - ADM_levels <- c("ADM1_NAME", "ADM2_NAME", "OU_NAME") - adm_long <- lapply(ADM_levels, function(level) { - df %>% - dplyr::filter(!is.na(.data[[level]]), !is.na(INDICATOR)) %>% - dplyr::group_by(.data[[level]], INDICATOR) %>% - dplyr::summarise( - dplyr::across(dplyr::all_of(names(checks)), ~ mean(checks[[cur_column()]](.x), na.rm = TRUE)), - n_reports = dplyr::n(), - .groups = "drop" + df_checks <- df %>% + dplyr::mutate( + !!!lapply(names(checks), function(check_name) { + cols <- checks[[check_name]] + if (all(cols %in% names(df))) { + rlang::expr(!!rlang::sym(cols[1]) >= !!rlang::sym(cols[2])) + } else { + rlang::expr(NA_real_) + } + }) %>% stats::setNames(paste0("check_", names(checks))) + ) + + check_cols <- names(df_checks)[grepl("^check_", names(df_checks))] + valid_checks <- check_cols[ + purrr::map_lgl(df_checks[check_cols], ~ !all(is.na(.x))) + ] + if (length(valid_checks) == 0) { + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise(total_reports = dplyr::n(), .groups = "drop") %>% + dplyr::filter(.data$total_reports >= min_reports) + adm_long <- tibble::tibble( + ADM1_NAME = character(), + ADM2_NAME = character(), + ADM2_ID = character(), + YEAR = integer(), + total_reports = integer(), + check_type = character(), + pct_coherent = numeric(), + check_label = character() + ) + return(list(adm_coherence = adm_coherence, adm_long = adm_long)) + } + + adm_coherence <- df_checks %>% + dplyr::group_by(.data$ADM1_NAME, .data$ADM2_NAME, .data$ADM2_ID, .data$YEAR) %>% + dplyr::summarise( + total_reports = dplyr::n(), + !!!purrr::map( + valid_checks, + ~ rlang::expr(100 * mean(.data[[.x]], na.rm = TRUE)) ) %>% - dplyr::filter(n_reports >= min_reports) %>% - tidyr::pivot_longer(cols = dplyr::all_of(names(checks)), names_to = "check", values_to = "coherence_rate") %>% - dplyr::mutate(level = level, 
label = check_labels[check]) - }) %>% dplyr::bind_rows() - adm_long$label <- factor(adm_long$label, levels = rev(check_labels)) - adm_long + stats::setNames(paste0("pct_coherent_", sub("^check_", "", valid_checks))), + .groups = "drop" + ) %>% + dplyr::filter(.data$total_reports >= min_reports) + + adm_long <- adm_coherence %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("pct_coherent_"), + names_to = "check_type", + values_to = "pct_coherent" + ) %>% + dplyr::filter(!is.na(.data$pct_coherent)) %>% + dplyr::mutate(check_label = dplyr::recode(.data$check_type, !!!check_labels)) + + list(adm_coherence = adm_coherence, adm_long = adm_long) } From 615fab0180a17030c14012cbc2fb5d4167d4df7e Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 27 Mar 2026 13:45:33 +0100 Subject: [PATCH 12/23] fix --- ...s2_outliers_imputation_magic_glasses.ipynb | 37 +++++++++- .../snt_dhis2_outliers_imputation_mean.ipynb | 68 ++++++++++--------- 2 files changed, 72 insertions(+), 33 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index cfdccf2..be13bd5 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -93,13 +93,34 @@ " log_msg(glue::glue(\"Using parallel seasonal detection with {SEASONAL_WORKERS} workers\"))\n", "}\n", "\n", - "config_json <- fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", + "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", + "\n", + "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\")\n", + "for (conf in snt_config_mandatory) {\n", + " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", + " msg <- 
paste(\"Missing configuration input:\", conf)\n", + " log_msg(msg)\n", + " stop(msg)\n", + " }\n", + "}\n", + "\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", "indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", "\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\"))\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- glue::glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\")\n", + " log_msg(msg)\n", + " stop(msg)\n", + "})\n", + "log_msg(glue::glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", + "log_msg(glue::glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", + "\n", + "if (all(c(\"YEAR\", \"MONTH\") %in% names(dhis2_routine))) {\n", + " dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)\n", + "}\n", "\n", "cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine))\n", "dt_routine <- as.data.table(dhis2_routine)[, ..cols_to_select]\n", @@ -113,6 +134,18 @@ " variable.factor = FALSE\n", ")\n", "\n", + "# Remove duplicated values (same strategy as mean/median)\n", + "dup_keys <- c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"YEAR\", \"MONTH\", \"INDICATOR\")\n", + "dup_keys <- intersect(dup_keys, names(dhis2_routine_long))\n", + "if (length(dup_keys) > 0) {\n", + " duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L]\n", + " if (nrow(duplicated) > 0) {\n", + " log_msg(glue::glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", 
+ " data.table::setkeyv(dhis2_routine_long, dup_keys)\n", + " dhis2_routine_long <- unique(dhis2_routine_long)\n", + " }\n", + "}\n", + "\n", "if (DEV_SUBSET) {\n", " unique_adm1 <- unique(dhis2_routine_long$ADM1_ID)\n", " adm1_to_keep <- unique_adm1[seq_len(min(DEV_SUBSET_ADM1_N, length(unique_adm1)))]\n", diff --git a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb index 9413267..2a4ac29 100644 --- a/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_mean/code/snt_dhis2_outliers_imputation_mean.ipynb @@ -131,12 +131,15 @@ }, "outputs": [], "source": [ - "# Check SNT configuration (shared helper)\n", - "validate_required_config_keys(\n", - " config_json = config_json,\n", - " keys = c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\"),\n", - " section = \"SNT_CONFIG\"\n", - ")\n", + "# Check SNT configuration \n", + "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") \n", + "for (conf in snt_config_mandatory) {\n", + " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", + " msg <- paste(\"Missing configuration input:\", conf)\n", + " log_msg(msg)\n", + " stop(msg)\n", + " }\n", + "}\n", "\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", @@ -174,14 +177,16 @@ }, "outputs": [], "source": [ - "# Load file from dataset (formatting) using shared helper\n", + "# Load file from dataset (formatting)\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- load_country_file_from_dataset(\n", - " dataset_id = dataset_name,\n", - " country_code = COUNTRY_CODE,\n", - " suffix = \"_routine.parquet\",\n", - " label = \"DHIS2 routine data\"\n", - ")\n", + 
"dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\") # log error message\n", + " log_msg(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "log_msg(glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", "\n", "log_msg(glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.\"))\n", "print(dim(dhis2_routine))\n", @@ -199,9 +204,8 @@ }, "outputs": [], "source": [ - "# YEAR and MONTH should be integers\n", - "# Use shared helper to normalize types when columns exist\n", - "dhis2_routine <- normalize_year_month_types(dhis2_routine)" + "# YEAR and MONTH should be integers; in the input data they are numeric, but we later use them as integers\n", + "dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)" ] }, { @@ -262,13 +266,9 @@ }, "outputs": [], "source": [ - "# Use shared helper to select, pivot and deduplicate routine rows\n", - "dhis2_routine_long <- prepare_routine_long(\n", - " routine_df = dhis2_routine,\n", - " fixed_cols = fixed_cols,\n", - " indicators = DHIS2_INDICATORS,\n", - " deduplicate = TRUE\n", - ")\n", + "dhis2_routine_long <- dhis2_routine %>%\n", + " select(all_of(c(fixed_cols, DHIS2_INDICATORS))) %>%\n", + " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\")\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -293,8 +293,19 @@ }, "outputs": [], "source": [ - "# Duplicates are handled by prepare_routine_long(..., deduplicate = TRUE)\n", - "log_msg(\"Routine long data prepared with shared helper (deduplication applied).\")" + "# check if there are any duplicates\n", + "duplicated <- dhis2_routine_long %>%\n", + " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, 
YEAR, MONTH, INDICATOR) %>%\n", + " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", + " filter(n > 1L)\n", + "\n", + "# Remove dups\n", + "if (nrow(duplicated) > 0) {\n", + " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", + " dhis2_routine_long <- dhis2_routine_long %>%\n", + " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, YEAR, MONTH, INDICATOR, .keep_all = TRUE)\n", + " head(duplicated)\n", + "}" ] }, { @@ -672,12 +683,7 @@ }, "outputs": [], "source": [ - "output_path <- standard_output_path(\n", - " data_root_path = DATA_PATH,\n", - " domain = \"dhis2\",\n", - " subdomain = \"outliers_imputation\",\n", - " create_dir = TRUE\n", - ")\n", + "output_path <- file.path(DATA_PATH, \"dhis2\", \"outliers_imputation\")\n", "\n", "# Mean detection table (for DB and reporting)\n", "outlier_col <- colnames(dhis2_routine_outliers_selection)[startsWith(colnames(dhis2_routine_outliers_selection), \"OUTLIER_\")][1]\n", From a4d521997ac43cee6e27ed73b3f701249502b4da Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 27 Mar 2026 14:42:14 +0100 Subject: [PATCH 13/23] last fix --- .../snt_dhis2_outliers_imputation_path.ipynb | 130 +++--------------- .../snt_dhis2_outliers_imputation_path.r | 106 ++++++++++++++ 2 files changed, 125 insertions(+), 111 deletions(-) create mode 100644 pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r diff --git a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb index 7504401..3fb9d5e 100644 --- a/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_path/code/snt_dhis2_outliers_imputation_path.ipynb @@ -65,14 +65,16 @@ }, "outputs": [], "source": [ - "# Project folders\n", - "ROOT_PATH <- \"~/workspace\" \n", + "# Project folders (ROOT_PATH injected by pipeline if 
available)\n", + "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\" \n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_path\")\n", "CODE_PATH <- file.path(ROOT_PATH, 'code') \n", "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", "\n", "# Load utils\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_path.r\"))\n", "\n", "# Load libraries \n", "required_packages <- c(\"arrow\", \"tidyverse\", \"jsonlite\", \"DBI\", \"RPostgres\", \"reticulate\", \"glue\")\n", @@ -84,7 +86,7 @@ "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "\n", "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "openhexa <- reticulate::import(\"openhexa.sdk\")" ] }, { @@ -129,7 +131,7 @@ "outputs": [], "source": [ "# Load SNT config\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", " error = function(e) {\n", " msg <- glue(\"[ERROR] Error while loading configuration {conditionMessage(e)}\")\n", " log_msg(msg)\n", @@ -270,12 +272,8 @@ }, "outputs": [], "source": [ - "dhis2_routine_long <- dhis2_routine %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM1_NAME\", \"ADM2_ID\", \"ADM2_NAME\", \"OU_ID\", \"OU_NAME\", \"PERIOD\", DHIS2_INDICATORS))) %>%\n", - " pivot_longer(cols = all_of(DHIS2_INDICATORS), names_to = \"INDICATOR\", values_to = \"VALUE\") %>%\n", - " # ⚠️ NEW: Complete missing date-indicator combinations for each facility\n", - " complete(nesting(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME), PERIOD, INDICATOR) %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"INDICATOR\", \"VALUE\")))\n", + "# Helper loaded from utils/snt_dhis2_outliers_imputation_path.r\n", + "dhis2_routine_long <- 
build_path_routine_long(dhis2_routine, DHIS2_INDICATORS)\n", "\n", "print(dim(dhis2_routine_long))\n", "head(dhis2_routine_long, 2)" @@ -300,17 +298,10 @@ }, "outputs": [], "source": [ - "# check if there are any duplicates\n", - "duplicated <- dhis2_routine_long %>%\n", - " group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>%\n", - " summarise(n = dplyr::n(), .groups= \"drop\") %>%\n", - " filter(n > 1L)\n", - "\n", - "# Remove dups\n", + "dedup_result <- remove_path_duplicates(dhis2_routine_long)\n", + "dhis2_routine_long <- dedup_result$data\n", + "duplicated <- dedup_result$duplicated\n", "if (nrow(duplicated) > 0) {\n", - " log_msg(glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " dhis2_routine_long <- dhis2_routine_long %>%\n", - " distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, .keep_all = TRUE)\n", " head(duplicated)\n", "}" ] @@ -413,23 +404,7 @@ }, "outputs": [], "source": [ - "# high presumed cases during lower tests\n", - "low_testing_periods <- dhis2_routine_outliers %>%\n", - " filter(INDICATOR == \"TEST\") %>%\n", - " mutate(\n", - " low_testing = case_when(VALUE < MEAN_80 ~ TRUE, TRUE ~ FALSE), \n", - " # presumed may not exceed upper limits for tests \n", - " upper_limit_tested = MEAN_80 + MEAN_DEVIATION * SD_80) %>% \n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"low_testing\", \"upper_limit_tested\")))\n", - "\n", - "# decide which one could be possible stock-out periods\n", - "possible_stockout <- dhis2_routine_outliers %>%\n", - " filter(OUTLIER_TREND == TRUE) %>%\n", - " left_join(low_testing_periods, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>% \n", - " # make sure value does not exceed reasonable figures\n", - " mutate(POSSIBLE_STKOUT = case_when(low_testing == TRUE & INDICATOR == \"PRES\" & VALUE < upper_limit_tested ~ TRUE, TRUE ~ FALSE)) %>%\n", - " filter(POSSIBLE_STKOUT == TRUE) %>%\n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", 
\"POSSIBLE_STKOUT\")))" + "possible_stockout <- detect_possible_stockout(dhis2_routine_outliers, MEAN_DEVIATION)" ] }, { @@ -481,23 +456,7 @@ }, "outputs": [], "source": [ - "# ⚠️ UPDATED\n", - "possible_epidemic <- dhis2_routine_outliers %>% \n", - " filter(INDICATOR == \"TEST\" | INDICATOR == \"CONF\") %>%\n", - " rename(total = VALUE) %>% \n", - " # outlier threshold max value\n", - " mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80) %>% \n", - " # remove columns not necessary for wider format\n", - " select(-c(\"MEAN_80\", \"SD_80\")) %>% \n", - " # wider format with two values (value and outlier-threshold max value) for each INDICATOR\n", - " pivot_wider(names_from = INDICATOR, values_from = c(total, max_value, OUTLIER_TREND)) %>% \n", - " unnest(cols = everything()) %>% \n", - " # ⚠️ NEW LOGIC: epidemic if CONF is outlier AND (TEST is outlier OR tests >= confirmed)\n", - " mutate(POSSIBLE_EPID = case_when(\n", - " OUTLIER_TREND_CONF == TRUE & (OUTLIER_TREND_TEST == TRUE | total_TEST >= total_CONF) ~ TRUE,\n", - " TRUE ~ FALSE)) %>%\n", - " filter(POSSIBLE_EPID == TRUE) %>% \n", - " select(all_of(c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"POSSIBLE_EPID\")))\n", + "possible_epidemic <- detect_possible_epidemic(dhis2_routine_outliers, MEAN_DEVIATION)\n", "\n", "epidemic_n <- length(unique(possible_epidemic$OU_ID))\n", "if (epidemic_n > 0) { \n", @@ -531,34 +490,11 @@ }, "outputs": [], "source": [ - "# Join columns and correct outliers column\n", - "routine_data_outliers_clean <- dhis2_routine_outliers %>% \n", - " left_join(possible_stockout, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>%\n", - " mutate(OUTLIER_TREND_01 = case_when(OUTLIER_TREND == TRUE & INDICATOR ==\"PRES\" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>%\n", - " left_join(possible_epidemic, by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\")) %>%\n", - " mutate(OUTLIER_TREND_02 = case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c(\"CONF\", 
\"TEST\") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>% \n", - " select(-OUTLIER_TREND) %>%\n", - " rename(OUTLIER_TREND = OUTLIER_TREND_02) %>% \n", - " mutate(\n", - " YEAR = as.integer(substr(PERIOD, 1, 4)),\n", - " MONTH = as.integer(substr(PERIOD, 5, 6))) %>%\n", - " select(all_of(\n", - " c(\n", - " \"PERIOD\",\n", - " \"YEAR\",\n", - " \"MONTH\",\n", - " \"ADM1_ID\",\n", - " \"ADM2_ID\",\n", - " \"OU_ID\", \n", - " \"INDICATOR\",\n", - " \"VALUE\",\n", - " \"MEAN_80\",\n", - " \"SD_80\",\n", - " \"OUTLIER_TREND\",\n", - " \"POSSIBLE_STKOUT\", \n", - " \"POSSIBLE_EPID\" \n", - " )\n", - " ))\n", + "routine_data_outliers_clean <- build_path_clean_outliers(\n", + " dhis2_routine_outliers = dhis2_routine_outliers,\n", + " possible_stockout = possible_stockout,\n", + " possible_epidemic = possible_epidemic\n", + ")\n", "\n", "print(dim(routine_data_outliers_clean))\n", "head(routine_data_outliers_clean, 2)" @@ -616,35 +552,7 @@ }, "outputs": [], "source": [ - "# ⚠️ UPDATED: Added reversal check to prevent illogical corrections\n", - "# replace outliers by mean_80\n", - "routine_data_outliers_imputed <- routine_data_outliers_clean %>% \n", - " rename(VALUE_OLD = VALUE) %>% \n", - " # replace outliers with the mean 80% value\n", - " mutate(VALUE_IMPUTED = ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>%\n", - " # ⚠️ NEW: Pivot to check test/conf relationship\n", - " select(all_of(c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"INDICATOR\", \"VALUE_OLD\", \"VALUE_IMPUTED\", \"OUTLIER_TREND\"))) %>%\n", - " pivot_wider(names_from = INDICATOR, values_from = c(VALUE_OLD, VALUE_IMPUTED, OUTLIER_TREND)) %>%\n", - " # ⚠️ NEW: Reversal check - undo corrections if they create impossible situations\n", - " # (i.e., if imputed tests < imputed conf, but original tests > original conf)\n", - " mutate(reverse_val = case_when(\n", - " !is.na(VALUE_IMPUTED_TEST) & !is.na(VALUE_IMPUTED_CONF) & \n", - " VALUE_IMPUTED_TEST < 
VALUE_IMPUTED_CONF & \n", - " VALUE_OLD_TEST > VALUE_OLD_CONF ~ TRUE,\n", - " TRUE ~ FALSE)) %>%\n", - " # Correct TEST if reversed\n", - " mutate(VALUE_IMPUTED_TEST = ifelse(reverse_val == TRUE, VALUE_OLD_TEST, VALUE_IMPUTED_TEST),\n", - " OUTLIER_TREND_TEST = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_TEST)) %>%\n", - " # Correct CONF if reversed\n", - " mutate(VALUE_IMPUTED_CONF = ifelse(reverse_val == TRUE, VALUE_OLD_CONF, VALUE_IMPUTED_CONF),\n", - " OUTLIER_TREND_CONF = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_CONF)) %>%\n", - " # ⚠️ Pivot back to long format\n", - " select(-reverse_val) %>%\n", - " pivot_longer(cols = starts_with(\"VALUE_OLD_\") | starts_with(\"VALUE_IMPUTED_\") | starts_with(\"OUTLIER_TREND_\"),\n", - " names_to = c(\".value\", \"INDICATOR\"),\n", - " names_pattern = \"(.*)_(.*)$\") %>%\n", - " arrange(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"INDICATOR\") %>%\n", - " select(all_of(c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"INDICATOR\", \"VALUE_OLD\", \"VALUE_IMPUTED\", \"OUTLIER_TREND\")))\n", + "routine_data_outliers_imputed <- impute_path_outliers(routine_data_outliers_clean)\n", "\n", "print(dim(routine_data_outliers_imputed))\n", "head(routine_data_outliers_imputed, 2)" diff --git a/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r new file mode 100644 index 0000000..3348940 --- /dev/null +++ b/pipelines/snt_dhis2_outliers_imputation_path/utils/snt_dhis2_outliers_imputation_path.r @@ -0,0 +1,106 @@ +# Helpers for PATH outliers imputation notebook. 
+ +build_path_routine_long <- function(dhis2_routine, DHIS2_INDICATORS) { + dhis2_routine %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME", "OU_ID", "OU_NAME", "PERIOD", DHIS2_INDICATORS))) %>% + tidyr::pivot_longer(cols = dplyr::all_of(DHIS2_INDICATORS), names_to = "INDICATOR", values_to = "VALUE") %>% + tidyr::complete(tidyr::nesting(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME), PERIOD, INDICATOR) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "INDICATOR", "VALUE"))) +} + +remove_path_duplicates <- function(dhis2_routine_long) { + duplicated <- dhis2_routine_long %>% + dplyr::group_by(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>% + dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% + dplyr::filter(n > 1L) + + if (nrow(duplicated) > 0) { + log_msg(glue::glue("Removing {nrow(duplicated)} duplicated values.")) + dhis2_routine_long <- dhis2_routine_long %>% + dplyr::distinct(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, .keep_all = TRUE) + } + + list(data = dhis2_routine_long, duplicated = duplicated) +} + +detect_possible_stockout <- function(dhis2_routine_outliers, MEAN_DEVIATION) { + low_testing_periods <- dhis2_routine_outliers %>% + dplyr::filter(INDICATOR == "TEST") %>% + dplyr::mutate( + low_testing = dplyr::case_when(VALUE < MEAN_80 ~ TRUE, TRUE ~ FALSE), + upper_limit_tested = MEAN_80 + MEAN_DEVIATION * SD_80 + ) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "low_testing", "upper_limit_tested"))) + + dhis2_routine_outliers %>% + dplyr::filter(OUTLIER_TREND == TRUE) %>% + dplyr::left_join(low_testing_periods, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% + dplyr::mutate(POSSIBLE_STKOUT = dplyr::case_when(low_testing == TRUE & INDICATOR == "PRES" & VALUE < upper_limit_tested ~ TRUE, TRUE ~ FALSE)) %>% + dplyr::filter(POSSIBLE_STKOUT == TRUE) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_STKOUT"))) +} + 
+detect_possible_epidemic <- function(dhis2_routine_outliers, MEAN_DEVIATION) { + dhis2_routine_outliers %>% + dplyr::filter(INDICATOR == "TEST" | INDICATOR == "CONF") %>% + dplyr::rename(total = VALUE) %>% + dplyr::mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80) %>% + dplyr::select(-c("MEAN_80", "SD_80")) %>% + tidyr::pivot_wider(names_from = INDICATOR, values_from = c(total, max_value, OUTLIER_TREND)) %>% + tidyr::unnest(cols = dplyr::everything()) %>% + dplyr::mutate(POSSIBLE_EPID = dplyr::case_when( + OUTLIER_TREND_CONF == TRUE & (OUTLIER_TREND_TEST == TRUE | total_TEST >= total_CONF) ~ TRUE, + TRUE ~ FALSE + )) %>% + dplyr::filter(POSSIBLE_EPID == TRUE) %>% + dplyr::select(dplyr::all_of(c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "POSSIBLE_EPID"))) +} + +build_path_clean_outliers <- function(dhis2_routine_outliers, possible_stockout, possible_epidemic) { + dhis2_routine_outliers %>% + dplyr::left_join(possible_stockout, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% + dplyr::mutate(OUTLIER_TREND_01 = dplyr::case_when(OUTLIER_TREND == TRUE & INDICATOR == "PRES" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>% + dplyr::left_join(possible_epidemic, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>% + dplyr::mutate(OUTLIER_TREND_02 = dplyr::case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c("CONF", "TEST") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>% + dplyr::select(-OUTLIER_TREND) %>% + dplyr::rename(OUTLIER_TREND = OUTLIER_TREND_02) %>% + dplyr::mutate( + YEAR = as.integer(substr(PERIOD, 1, 4)), + MONTH = as.integer(substr(PERIOD, 5, 6)) + ) %>% + dplyr::select(dplyr::all_of(c( + "PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", + "INDICATOR", "VALUE", "MEAN_80", "SD_80", + "OUTLIER_TREND", "POSSIBLE_STKOUT", "POSSIBLE_EPID" + ))) +} + +impute_path_outliers <- function(routine_data_outliers_clean) { + routine_data_outliers_clean %>% + dplyr::rename(VALUE_OLD = VALUE) %>% + dplyr::mutate(VALUE_IMPUTED = 
ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>% + dplyr::select(dplyr::all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND"))) %>% + tidyr::pivot_wider(names_from = INDICATOR, values_from = c(VALUE_OLD, VALUE_IMPUTED, OUTLIER_TREND)) %>% + dplyr::mutate(reverse_val = dplyr::case_when( + !is.na(VALUE_IMPUTED_TEST) & !is.na(VALUE_IMPUTED_CONF) & + VALUE_IMPUTED_TEST < VALUE_IMPUTED_CONF & + VALUE_OLD_TEST > VALUE_OLD_CONF ~ TRUE, + TRUE ~ FALSE + )) %>% + dplyr::mutate( + VALUE_IMPUTED_TEST = ifelse(reverse_val == TRUE, VALUE_OLD_TEST, VALUE_IMPUTED_TEST), + OUTLIER_TREND_TEST = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_TEST) + ) %>% + dplyr::mutate( + VALUE_IMPUTED_CONF = ifelse(reverse_val == TRUE, VALUE_OLD_CONF, VALUE_IMPUTED_CONF), + OUTLIER_TREND_CONF = ifelse(reverse_val == TRUE, FALSE, OUTLIER_TREND_CONF) + ) %>% + dplyr::select(-reverse_val) %>% + tidyr::pivot_longer( + cols = dplyr::starts_with("VALUE_OLD_") | dplyr::starts_with("VALUE_IMPUTED_") | dplyr::starts_with("OUTLIER_TREND_"), + names_to = c(".value", "INDICATOR"), + names_pattern = "(.*)_(.*)$" + ) %>% + dplyr::arrange(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>% + dplyr::select(dplyr::all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND"))) +} From e3404c85199ac07b8adc136154c095b3afaa1a95 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 27 Mar 2026 15:07:51 +0100 Subject: [PATCH 14/23] Refactor magic glasses input preparation and outlier detection functions for improved modularity and clarity. Introduced `prepare_magic_glasses_input` to streamline input handling and context setup, while enhancing logging and error management. Updated notebook to utilize new function for better organization. 
--- ...s2_outliers_imputation_magic_glasses.ipynb | 288 +++-------------- ..._dhis2_outliers_imputation_magic_glasses.r | 296 ++++++++++++++++++ 2 files changed, 334 insertions(+), 250 deletions(-) diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb index be13bd5..ce4fdc2 100644 --- a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/code/snt_dhis2_outliers_imputation_magic_glasses.ipynb @@ -66,101 +66,27 @@ "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_outliers_imputation_magic_glasses\")\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_outliers_imputation_magic_glasses.r\"))\n", "\n", - "required_packages <- c(\"arrow\", \"data.table\", \"jsonlite\", \"reticulate\", \"glue\")\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " required_packages <- c(required_packages, \"forecast\")\n", - "}\n", - "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", - " required_packages <- c(required_packages, \"future\", \"future.apply\")\n", - "}\n", - "\n", - "setup_ctx <- bootstrap_magic_glasses_context(\n", + "mg_input <- prepare_magic_glasses_input(\n", " root_path = ROOT_PATH,\n", - " required_packages = required_packages\n", + " config_file_name = CONFIG_FILE_NAME,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " seasonal_workers = SEASONAL_WORKERS,\n", + " dev_subset = DEV_SUBSET,\n", + " dev_subset_adm1_n = DEV_SUBSET_ADM1_N\n", ")\n", "\n", + "setup_ctx <- mg_input$setup_ctx\n", + "config_json <- mg_input$config_json\n", + "COUNTRY_CODE <- mg_input$country_code\n", + "fixed_cols <- mg_input$fixed_cols\n", + "indicators_to_keep <- mg_input$indicators_to_keep\n", + "dhis2_routine <- mg_input$dhis2_routine\n", + 
"dhis2_routine_long <- mg_input$dhis2_routine_long\n", + "\n", "CODE_PATH <- setup_ctx$CODE_PATH\n", "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", "DATA_PATH <- setup_ctx$DATA_PATH\n", - "openhexa <- setup_ctx$openhexa\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " log_msg(\"[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.\", \"warning\")\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE && SEASONAL_WORKERS > 1) {\n", - " future::plan(future::multisession, workers = SEASONAL_WORKERS)\n", - " log_msg(glue::glue(\"Using parallel seasonal detection with {SEASONAL_WORKERS} workers\"))\n", - "}\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", - "\n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\")\n", - "for (conf in snt_config_mandatory) {\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " log_msg(msg)\n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", - "indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- glue::glue(\"[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}\")\n", - " log_msg(msg)\n", - " stop(msg)\n", - "})\n", - "log_msg(glue::glue(\"DHIS2 routine data loaded from dataset : {dataset_name}\"))\n", - "log_msg(glue::glue(\"DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} 
columns.\"))\n", - "\n", - "if (all(c(\"YEAR\", \"MONTH\") %in% names(dhis2_routine))) {\n", - " dhis2_routine[c(\"YEAR\", \"MONTH\")] <- lapply(dhis2_routine[c(\"YEAR\", \"MONTH\")], as.integer)\n", - "}\n", - "\n", - "cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine))\n", - "dt_routine <- as.data.table(dhis2_routine)[, ..cols_to_select]\n", - "\n", - "dhis2_routine_long <- melt(\n", - " dt_routine,\n", - " id.vars = intersect(fixed_cols, names(dt_routine)),\n", - " measure.vars = intersect(indicators_to_keep, names(dt_routine)),\n", - " variable.name = \"INDICATOR\",\n", - " value.name = \"VALUE\",\n", - " variable.factor = FALSE\n", - ")\n", - "\n", - "# Remove duplicated values (same strategy as mean/median)\n", - "dup_keys <- c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\", \"PERIOD\", \"YEAR\", \"MONTH\", \"INDICATOR\")\n", - "dup_keys <- intersect(dup_keys, names(dhis2_routine_long))\n", - "if (length(dup_keys) > 0) {\n", - " duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L]\n", - " if (nrow(duplicated) > 0) {\n", - " log_msg(glue::glue(\"Removing {nrow(duplicated)} duplicated values.\"))\n", - " data.table::setkeyv(dhis2_routine_long, dup_keys)\n", - " dhis2_routine_long <- unique(dhis2_routine_long)\n", - " }\n", - "}\n", - "\n", - "if (DEV_SUBSET) {\n", - " unique_adm1 <- unique(dhis2_routine_long$ADM1_ID)\n", - " adm1_to_keep <- unique_adm1[seq_len(min(DEV_SUBSET_ADM1_N, length(unique_adm1)))]\n", - " dhis2_routine_long <- dhis2_routine_long[ADM1_ID %in% adm1_to_keep]\n", - " log_msg(glue::glue(\"DEV_SUBSET enabled: keeping {length(adm1_to_keep)} ADM1 values\"), \"warning\")\n", - "}\n", - "\n", - "log_msg(glue::glue(\"Data loaded: {nrow(dhis2_routine_long)} rows, {length(unique(dhis2_routine_long$OU_ID))} facilities\"))\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " n_groups <- uniqueN(dhis2_routine_long[, .(OU_ID, INDICATOR)])\n", - " log_msg(glue::glue(\"Complete mode active: seasonal detection will run on 
up to {n_groups} OU_ID x INDICATOR time series.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"Partial mode active: seasonal detection is skipped.\")\n", - "}" + "openhexa <- setup_ctx$openhexa" ] }, { @@ -175,8 +101,9 @@ "outputs": [], "source": [ "# Helpers loaded from utils/snt_dhis2_outliers_imputation_magic_glasses.r\n", - "# - detect_outliers_mad_custom()\n", - "# - detect_seasonal_outliers()" + "# - prepare_magic_glasses_input()\n", + "# - run_magic_glasses_outlier_detection()\n", + "# - export_magic_glasses_outputs()" ] }, { @@ -190,72 +117,18 @@ }, "outputs": [], "source": [ - "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", - " log_msg(\"Starting MAD15 detection...\")\n", - " flagged_outliers_mad15 <- detect_outliers_mad_custom(dhis2_routine_long, DEVIATION_MAD15)\n", - " flagged_outliers_mad15_filtered <- flagged_outliers_mad15[OUTLIER_MAD15 == FALSE]\n", - "\n", - " log_msg(\"Starting MAD10 detection...\")\n", - " flagged_outliers_mad10 <- detect_outliers_mad_custom(flagged_outliers_mad15_filtered, DEVIATION_MAD10)\n", - " setnames(flagged_outliers_mad10, paste0(\"OUTLIER_MAD\", DEVIATION_MAD10), \"OUTLIER_MAD15_MAD10\")\n", - "\n", - " join_cols <- c(\"PERIOD\", \"OU_ID\", \"INDICATOR\")\n", - " mad10_subset <- flagged_outliers_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]\n", - " flagged_outliers_mad15_mad10 <- merge(\n", - " flagged_outliers_mad15,\n", - " mad10_subset,\n", - " by = join_cols,\n", - " all.x = TRUE\n", - " )\n", - " flagged_outliers_mad15_mad10[is.na(OUTLIER_MAD15_MAD10), OUTLIER_MAD15_MAD10 := TRUE]\n", - " log_msg(glue::glue(\"MAD partial done: {sum(flagged_outliers_mad15_mad10$OUTLIER_MAD15_MAD10)} outliers flagged\"))\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " flagged_outliers_mad15_mad10_filtered <- flagged_outliers_mad15_mad10[OUTLIER_MAD15_MAD10 == FALSE]\n", - "\n", - " if (nrow(flagged_outliers_mad15_mad10_filtered) == 0) {\n", - " log_msg(\"No rows left after MAD 
partial filtering; seasonal step will be skipped.\", \"warning\")\n", - " flagged_outliers_seasonal5 <- copy(flagged_outliers_mad15_mad10_filtered)\n", - " flagged_outliers_seasonal5[, OUTLIER_SEASONAL5 := FALSE]\n", - " flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5\n", - " flagged_outliers_seasonal3 <- copy(flagged_outliers_seasonal5_filtered)\n", - " flagged_outliers_seasonal3[, OUTLIER_SEASONAL3 := FALSE]\n", - " } else {\n", - " log_msg(glue::glue(\"Starting SEASONAL5 detection on {nrow(flagged_outliers_mad15_mad10_filtered)} rows...\"))\n", - " t_seasonal5 <- system.time({\n", - " flagged_outliers_seasonal5 <- detect_seasonal_outliers(\n", - " flagged_outliers_mad15_mad10_filtered,\n", - " deviation = DEVIATION_SEASONAL5,\n", - " workers = SEASONAL_WORKERS\n", - " )\n", - " })\n", - " flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5[OUTLIER_SEASONAL5 == FALSE]\n", - " log_msg(glue::glue(\"SEASONAL5 finished in {round(t_seasonal5['elapsed'], 1)}s. 
Remaining rows: {nrow(flagged_outliers_seasonal5_filtered)}\"))\n", - "\n", - " log_msg(glue::glue(\"Starting SEASONAL3 detection on {nrow(flagged_outliers_seasonal5_filtered)} rows...\"))\n", - " t_seasonal3 <- system.time({\n", - " flagged_outliers_seasonal3 <- detect_seasonal_outliers(\n", - " flagged_outliers_seasonal5_filtered,\n", - " deviation = DEVIATION_SEASONAL3,\n", - " workers = SEASONAL_WORKERS\n", - " )\n", - " })\n", - " log_msg(glue::glue(\"SEASONAL3 finished in {round(t_seasonal3['elapsed'], 1)}s.\"))\n", - " }\n", - "\n", - " setnames(flagged_outliers_seasonal3, paste0(\"OUTLIER_SEASONAL\", DEVIATION_SEASONAL3), \"OUTLIER_SEASONAL5_SEASONAL3\")\n", + "detection_result <- run_magic_glasses_outlier_detection(\n", + " dhis2_routine_long = dhis2_routine_long,\n", + " deviation_mad15 = DEVIATION_MAD15,\n", + " deviation_mad10 = DEVIATION_MAD10,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " deviation_seasonal5 = DEVIATION_SEASONAL5,\n", + " deviation_seasonal3 = DEVIATION_SEASONAL3,\n", + " seasonal_workers = SEASONAL_WORKERS\n", + ")\n", "\n", - " seasonal3_subset <- flagged_outliers_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]\n", - " flagged_outliers_seasonal5_seasonal3 <- merge(\n", - " flagged_outliers_seasonal5,\n", - " seasonal3_subset,\n", - " by = join_cols,\n", - " all.x = TRUE\n", - " )\n", - " flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE]\n", - " log_msg(glue::glue(\"SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged\"))\n", - "}" + "flagged_outliers_mad15_mad10 <- detection_result$flagged_outliers_mad15_mad10\n", + "flagged_outliers_seasonal5_seasonal3 <- detection_result$flagged_outliers_seasonal5_seasonal3" ] }, { @@ -269,102 +142,17 @@ }, "outputs": [], "source": [ - "base_cols <- intersect(c(fixed_cols, \"INDICATOR\", \"VALUE\"), names(dhis2_routine_long))\n", - 
"flagged_outliers_mg <- copy(dhis2_routine_long[, ..base_cols])\n", - "join_cols <- c(\"PERIOD\", \"OU_ID\", \"INDICATOR\")\n", - "\n", - "if (RUN_MAGIC_GLASSES_PARTIAL | RUN_MAGIC_GLASSES_COMPLETE) {\n", - " partial_subset <- flagged_outliers_mad15_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)]\n", - " flagged_outliers_mg <- merge(flagged_outliers_mg, partial_subset, by = join_cols, all.x = TRUE)\n", - " setnames(flagged_outliers_mg, \"OUTLIER_MAD15_MAD10\", \"OUTLIER_MAGIC_GLASSES_PARTIAL\")\n", - "}\n", - "\n", - "if (RUN_MAGIC_GLASSES_COMPLETE) {\n", - " complete_subset <- flagged_outliers_seasonal5_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)]\n", - " flagged_outliers_mg <- merge(flagged_outliers_mg, complete_subset, by = join_cols, all.x = TRUE)\n", - " setnames(flagged_outliers_mg, \"OUTLIER_SEASONAL5_SEASONAL3\", \"OUTLIER_MAGIC_GLASSES_COMPLETE\")\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE) & OUTLIER_MAGIC_GLASSES_PARTIAL == TRUE,\n", - " OUTLIER_MAGIC_GLASSES_COMPLETE := TRUE]\n", - "}\n", - "\n", - "if (\"OUTLIER_MAGIC_GLASSES_PARTIAL\" %in% names(flagged_outliers_mg)) {\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_PARTIAL), OUTLIER_MAGIC_GLASSES_PARTIAL := FALSE]\n", - "}\n", - "if (\"OUTLIER_MAGIC_GLASSES_COMPLETE\" %in% names(flagged_outliers_mg)) {\n", - " flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE), OUTLIER_MAGIC_GLASSES_COMPLETE := FALSE]\n", - "}\n", - "\n", - "active_outlier_col <- if (\n", - " RUN_MAGIC_GLASSES_COMPLETE && \"OUTLIER_MAGIC_GLASSES_COMPLETE\" %in% names(flagged_outliers_mg)\n", - ") {\n", - " \"OUTLIER_MAGIC_GLASSES_COMPLETE\"\n", - "} else {\n", - " \"OUTLIER_MAGIC_GLASSES_PARTIAL\"\n", - "}\n", - "\n", - "if (!(active_outlier_col %in% names(flagged_outliers_mg))) {\n", - " stop(glue::glue(\"Expected outlier flag column not found: {active_outlier_col}\"))\n", - "}\n", - "\n", - "pyramid_names <- unique(as.data.table(dhis2_routine)[, .(\n", - " 
ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME\n", - ")])\n", - "\n", - "# 1) Detected table: full routine data + OUTLIER_DETECTED flag (same structure as mean/median/iqr/path)\n", - "outlier_method_label <- if (active_outlier_col == \"OUTLIER_MAGIC_GLASSES_COMPLETE\") \"MAGIC_GLASSES_COMPLETE\" else \"MAGIC_GLASSES_PARTIAL\"\n", - "detected_tbl <- flagged_outliers_mg[, .(\n", - " PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE,\n", - " OUTLIER_DETECTED = get(active_outlier_col),\n", - " OUTLIER_METHOD = outlier_method_label\n", - ")]\n", - "detected_tbl[is.na(OUTLIER_DETECTED), OUTLIER_DETECTED := FALSE]\n", - "detected_tbl <- merge(detected_tbl, unique(pyramid_names), by = c(\"ADM1_ID\", \"ADM2_ID\", \"OU_ID\"), all.x = TRUE)\n", - "detected_tbl[, DATE := as.Date(sprintf(\"%04d-%02d-01\", YEAR, MONTH))]\n", - "arrow::write_parquet(detected_tbl, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_detected.parquet\")))\n", - "n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE)\n", - "log_msg(glue::glue(\"Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {COUNTRY_CODE}_routine_outliers_detected.parquet\"))\n", - "\n", - "# 2) Imputed routine data (same moving-average logic as other outlier pipelines)\n", - "imputed_long <- copy(flagged_outliers_mg)\n", - "setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH)\n", - "imputed_long[, TO_IMPUTE := fifelse(get(active_outlier_col) == TRUE, NA_real_, VALUE)]\n", - "imputed_long[\n", - " , MOVING_AVG := frollapply(\n", - " TO_IMPUTE,\n", - " n = 3,\n", - " FUN = function(x) ceiling(mean(x, na.rm = TRUE)),\n", - " align = \"center\"\n", - " ),\n", - " by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR)\n", - "]\n", - "imputed_long[, VALUE_IMPUTED := fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)]\n", - "imputed_long[, VALUE := VALUE_IMPUTED]\n", - "imputed_long[, c(\"TO_IMPUTE\", \"MOVING_AVG\", \"VALUE_IMPUTED\") := NULL]\n", - "\n", - 
"routine_imputed <- to_routine_wide(\n", - " dt_long = imputed_long,\n", + "export_magic_glasses_outputs(\n", + " dhis2_routine_long = dhis2_routine_long,\n", + " flagged_outliers_mad15_mad10 = flagged_outliers_mad15_mad10,\n", + " flagged_outliers_seasonal5_seasonal3 = flagged_outliers_seasonal5_seasonal3,\n", + " run_complete = RUN_MAGIC_GLASSES_COMPLETE,\n", + " dhis2_routine = dhis2_routine,\n", " fixed_cols = fixed_cols,\n", " indicators_to_keep = indicators_to_keep,\n", - " pyramid_names = pyramid_names\n", - ")\n", - "arrow::write_parquet(routine_imputed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_imputed.parquet\")))\n", - "log_msg(glue::glue(\"Exported routine imputed table to {COUNTRY_CODE}_routine_outliers_imputed.parquet\"))\n", - "\n", - "# 3) Removed routine data\n", - "# We set outlier values to NA (we do not remove rows). The routine data keeps the same structure.\n", - "removed_long <- copy(flagged_outliers_mg)\n", - "removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_]\n", - "\n", - "routine_removed <- to_routine_wide(\n", - " dt_long = removed_long,\n", - " fixed_cols = fixed_cols,\n", - " indicators_to_keep = indicators_to_keep,\n", - " pyramid_names = pyramid_names\n", - ")\n", - "arrow::write_parquet(routine_removed, file.path(OUTPUT_DIR, paste0(COUNTRY_CODE, \"_routine_outliers_removed.parquet\")))\n", - "log_msg(glue::glue(\"Exported routine removed table to {COUNTRY_CODE}_routine_outliers_removed.parquet\"))\n", - "\n", - "log_msg(\"MG outlier tables exported successfully.\")" + " output_dir = OUTPUT_DIR,\n", + " country_code = COUNTRY_CODE\n", + ")" ] } ], diff --git a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r index 28ec619..8cd4bb4 100644 --- 
a/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r +++ b/pipelines/snt_dhis2_outliers_imputation_magic_glasses/utils/snt_dhis2_outliers_imputation_magic_glasses.r @@ -17,6 +17,9 @@ bootstrap_magic_glasses_context <- function( if (load_openhexa) { openhexa <- reticulate::import("openhexa.sdk") } + # snt_utils::log_msg() relies on a global `openhexa` object. + # Expose it before any helper function logs messages. + assign("openhexa", openhexa, envir = .GlobalEnv) return(list( ROOT_PATH = root_path, @@ -125,3 +128,296 @@ to_routine_wide <- function(dt_long, fixed_cols, indicators_to_keep, pyramid_nam routine_wide } +prepare_magic_glasses_input <- function( + root_path, + config_file_name = "SNT_config.json", + run_complete = FALSE, + seasonal_workers = 1, + dev_subset = FALSE, + dev_subset_adm1_n = 2 +) { + required_packages <- c("arrow", "data.table", "jsonlite", "reticulate", "glue") + if (run_complete) { + required_packages <- c(required_packages, "forecast") + } + if (run_complete && seasonal_workers > 1) { + required_packages <- c(required_packages, "future", "future.apply") + } + + setup_ctx <- bootstrap_magic_glasses_context( + root_path = root_path, + required_packages = required_packages + ) + + if (run_complete) { + log_msg("[WARNING] Complete mode: seasonal detection is very computationally intensive and can take several hours to run.", "warning") + } + + if (run_complete && seasonal_workers > 1) { + future::plan(future::multisession, workers = seasonal_workers) + log_msg(glue::glue("Using parallel seasonal detection with {seasonal_workers} workers")) + } + + config_json <- jsonlite::fromJSON(file.path(setup_ctx$CONFIG_PATH, config_file_name)) + + snt_config_mandatory <- c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2") + for (conf in snt_config_mandatory) { + if (is.null(config_json$SNT_CONFIG[[conf]])) { + msg <- paste("Missing configuration input:", conf) + log_msg(msg) + 
stop(msg) + } + } + + country_code <- config_json$SNT_CONFIG$COUNTRY_CODE + fixed_cols <- c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID") + indicators_to_keep <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) + + dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + dhis2_routine <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine data file for {country_code} : {conditionMessage(e)}") + log_msg(msg) + stop(msg) + } + ) + log_msg(glue::glue("DHIS2 routine data loaded from dataset : {dataset_name}")) + log_msg(glue::glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns.")) + + if (all(c("YEAR", "MONTH") %in% names(dhis2_routine))) { + dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.integer) + } + + cols_to_select <- intersect(c(fixed_cols, indicators_to_keep), names(dhis2_routine)) + dt_routine <- data.table::as.data.table(dhis2_routine)[, ..cols_to_select] + + dhis2_routine_long <- data.table::melt( + dt_routine, + id.vars = intersect(fixed_cols, names(dt_routine)), + measure.vars = intersect(indicators_to_keep, names(dt_routine)), + variable.name = "INDICATOR", + value.name = "VALUE", + variable.factor = FALSE + ) + + dup_keys <- c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD", "YEAR", "MONTH", "INDICATOR") + dup_keys <- intersect(dup_keys, names(dhis2_routine_long)) + if (length(dup_keys) > 0) { + duplicated <- dhis2_routine_long[, .N, by = dup_keys][N > 1L] + if (nrow(duplicated) > 0) { + log_msg(glue::glue("Removing {nrow(duplicated)} duplicated values.")) + data.table::setkeyv(dhis2_routine_long, dup_keys) + dhis2_routine_long <- unique(dhis2_routine_long) + } + } + + if (dev_subset) { + unique_adm1 <- unique(dhis2_routine_long$ADM1_ID) + adm1_to_keep <- 
unique_adm1[seq_len(min(dev_subset_adm1_n, length(unique_adm1)))] + dhis2_routine_long <- dhis2_routine_long[ADM1_ID %in% adm1_to_keep] + log_msg(glue::glue("DEV_SUBSET enabled: keeping {length(adm1_to_keep)} ADM1 values"), "warning") + } + + log_msg(glue::glue("Data loaded: {nrow(dhis2_routine_long)} rows, {length(unique(dhis2_routine_long$OU_ID))} facilities")) + + if (run_complete) { + n_groups <- data.table::uniqueN(dhis2_routine_long[, .(OU_ID, INDICATOR)]) + log_msg(glue::glue("Complete mode active: seasonal detection will run on up to {n_groups} OU_ID x INDICATOR time series."), "warning") + } else { + log_msg("Partial mode active: seasonal detection is skipped.") + } + + list( + setup_ctx = setup_ctx, + config_json = config_json, + country_code = country_code, + fixed_cols = fixed_cols, + indicators_to_keep = indicators_to_keep, + dhis2_routine = dhis2_routine, + dhis2_routine_long = dhis2_routine_long + ) +} + +run_magic_glasses_outlier_detection <- function( + dhis2_routine_long, + deviation_mad15 = 15, + deviation_mad10 = 10, + run_complete = FALSE, + deviation_seasonal5 = 5, + deviation_seasonal3 = 3, + seasonal_workers = 1 +) { + log_msg("Starting MAD15 detection...") + flagged_outliers_mad15 <- detect_outliers_mad_custom(dhis2_routine_long, deviation_mad15) + flagged_outliers_mad15_filtered <- flagged_outliers_mad15[OUTLIER_MAD15 == FALSE] + + log_msg("Starting MAD10 detection...") + flagged_outliers_mad10 <- detect_outliers_mad_custom(flagged_outliers_mad15_filtered, deviation_mad10) + data.table::setnames(flagged_outliers_mad10, paste0("OUTLIER_MAD", deviation_mad10), "OUTLIER_MAD15_MAD10") + + join_cols <- c("PERIOD", "OU_ID", "INDICATOR") + mad10_subset <- flagged_outliers_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)] + flagged_outliers_mad15_mad10 <- merge( + flagged_outliers_mad15, + mad10_subset, + by = join_cols, + all.x = TRUE + ) + flagged_outliers_mad15_mad10[is.na(OUTLIER_MAD15_MAD10), OUTLIER_MAD15_MAD10 := TRUE] + 
log_msg(glue::glue("MAD partial done: {sum(flagged_outliers_mad15_mad10$OUTLIER_MAD15_MAD10)} outliers flagged")) + + flagged_outliers_seasonal5_seasonal3 <- NULL + if (run_complete) { + flagged_outliers_mad15_mad10_filtered <- flagged_outliers_mad15_mad10[OUTLIER_MAD15_MAD10 == FALSE] + + if (nrow(flagged_outliers_mad15_mad10_filtered) == 0) { + log_msg("No rows left after MAD partial filtering; seasonal step will be skipped.", "warning") + flagged_outliers_seasonal5 <- data.table::copy(flagged_outliers_mad15_mad10_filtered) + flagged_outliers_seasonal5[, OUTLIER_SEASONAL5 := FALSE] + flagged_outliers_seasonal3 <- data.table::copy(flagged_outliers_seasonal5) + flagged_outliers_seasonal3[, OUTLIER_SEASONAL3 := FALSE] + } else { + log_msg(glue::glue("Starting SEASONAL5 detection on {nrow(flagged_outliers_mad15_mad10_filtered)} rows...")) + t_seasonal5 <- system.time({ + flagged_outliers_seasonal5 <- detect_seasonal_outliers( + flagged_outliers_mad15_mad10_filtered, + deviation = deviation_seasonal5, + workers = seasonal_workers + ) + }) + flagged_outliers_seasonal5_filtered <- flagged_outliers_seasonal5[OUTLIER_SEASONAL5 == FALSE] + log_msg(glue::glue("SEASONAL5 finished in {round(t_seasonal5['elapsed'], 1)}s. 
Remaining rows: {nrow(flagged_outliers_seasonal5_filtered)}")) + + log_msg(glue::glue("Starting SEASONAL3 detection on {nrow(flagged_outliers_seasonal5_filtered)} rows...")) + t_seasonal3 <- system.time({ + flagged_outliers_seasonal3 <- detect_seasonal_outliers( + flagged_outliers_seasonal5_filtered, + deviation = deviation_seasonal3, + workers = seasonal_workers + ) + }) + log_msg(glue::glue("SEASONAL3 finished in {round(t_seasonal3['elapsed'], 1)}s.")) + } + + data.table::setnames(flagged_outliers_seasonal3, paste0("OUTLIER_SEASONAL", deviation_seasonal3), "OUTLIER_SEASONAL5_SEASONAL3") + + seasonal3_subset <- flagged_outliers_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)] + flagged_outliers_seasonal5_seasonal3 <- merge( + flagged_outliers_seasonal5, + seasonal3_subset, + by = join_cols, + all.x = TRUE + ) + flagged_outliers_seasonal5_seasonal3[is.na(OUTLIER_SEASONAL5_SEASONAL3), OUTLIER_SEASONAL5_SEASONAL3 := TRUE] + log_msg(glue::glue("SEASONAL complete done: {sum(flagged_outliers_seasonal5_seasonal3$OUTLIER_SEASONAL5_SEASONAL3)} outliers flagged")) + } + + list( + flagged_outliers_mad15_mad10 = flagged_outliers_mad15_mad10, + flagged_outliers_seasonal5_seasonal3 = flagged_outliers_seasonal5_seasonal3 + ) +} + +export_magic_glasses_outputs <- function( + dhis2_routine_long, + flagged_outliers_mad15_mad10, + flagged_outliers_seasonal5_seasonal3, + run_complete, + dhis2_routine, + fixed_cols, + indicators_to_keep, + output_dir, + country_code +) { + base_cols <- intersect(c(fixed_cols, "INDICATOR", "VALUE"), names(dhis2_routine_long)) + flagged_outliers_mg <- data.table::copy(dhis2_routine_long[, ..base_cols]) + join_cols <- c("PERIOD", "OU_ID", "INDICATOR") + + partial_subset <- flagged_outliers_mad15_mad10[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_MAD15_MAD10)] + flagged_outliers_mg <- merge(flagged_outliers_mg, partial_subset, by = join_cols, all.x = TRUE) + data.table::setnames(flagged_outliers_mg, "OUTLIER_MAD15_MAD10", 
"OUTLIER_MAGIC_GLASSES_PARTIAL") + + if (run_complete && !is.null(flagged_outliers_seasonal5_seasonal3)) { + complete_subset <- flagged_outliers_seasonal5_seasonal3[, .(PERIOD, OU_ID, INDICATOR, OUTLIER_SEASONAL5_SEASONAL3)] + flagged_outliers_mg <- merge(flagged_outliers_mg, complete_subset, by = join_cols, all.x = TRUE) + data.table::setnames(flagged_outliers_mg, "OUTLIER_SEASONAL5_SEASONAL3", "OUTLIER_MAGIC_GLASSES_COMPLETE") + flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE) & OUTLIER_MAGIC_GLASSES_PARTIAL == TRUE, OUTLIER_MAGIC_GLASSES_COMPLETE := TRUE] + } + + flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_PARTIAL), OUTLIER_MAGIC_GLASSES_PARTIAL := FALSE] + if ("OUTLIER_MAGIC_GLASSES_COMPLETE" %in% names(flagged_outliers_mg)) { + flagged_outliers_mg[is.na(OUTLIER_MAGIC_GLASSES_COMPLETE), OUTLIER_MAGIC_GLASSES_COMPLETE := FALSE] + } + + active_outlier_col <- if (run_complete && "OUTLIER_MAGIC_GLASSES_COMPLETE" %in% names(flagged_outliers_mg)) { + "OUTLIER_MAGIC_GLASSES_COMPLETE" + } else { + "OUTLIER_MAGIC_GLASSES_PARTIAL" + } + + if (!(active_outlier_col %in% names(flagged_outliers_mg))) { + stop(glue::glue("Expected outlier flag column not found: {active_outlier_col}")) + } + + pyramid_names <- unique(data.table::as.data.table(dhis2_routine)[, .( + ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME + )]) + + outlier_method_label <- if (active_outlier_col == "OUTLIER_MAGIC_GLASSES_COMPLETE") "MAGIC_GLASSES_COMPLETE" else "MAGIC_GLASSES_PARTIAL" + detected_tbl <- flagged_outliers_mg[, .( + PERIOD, YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE, + OUTLIER_DETECTED = get(active_outlier_col), + OUTLIER_METHOD = outlier_method_label + )] + detected_tbl[is.na(OUTLIER_DETECTED), OUTLIER_DETECTED := FALSE] + detected_tbl <- merge(detected_tbl, unique(pyramid_names), by = c("ADM1_ID", "ADM2_ID", "OU_ID"), all.x = TRUE) + detected_tbl[, DATE := as.Date(sprintf("%04d-%02d-01", YEAR, MONTH))] + arrow::write_parquet(detected_tbl, 
file.path(output_dir, paste0(country_code, "_routine_outliers_detected.parquet"))) + n_out <- sum(detected_tbl$OUTLIER_DETECTED == TRUE) + log_msg(glue::glue("Exported full detection table ({nrow(detected_tbl)} rows, {n_out} outliers) to {country_code}_routine_outliers_detected.parquet")) + + imputed_long <- data.table::copy(flagged_outliers_mg) + data.table::setorder(imputed_long, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, PERIOD, YEAR, MONTH) + imputed_long[, TO_IMPUTE := data.table::fifelse(get(active_outlier_col) == TRUE, NA_real_, VALUE)] + imputed_long[ + , + MOVING_AVG := data.table::frollapply( + TO_IMPUTE, + n = 3, + FUN = function(x) ceiling(mean(x, na.rm = TRUE)), + align = "center" + ), + by = .(ADM1_ID, ADM2_ID, OU_ID, INDICATOR) + ] + imputed_long[, VALUE_IMPUTED := data.table::fifelse(is.na(TO_IMPUTE), MOVING_AVG, TO_IMPUTE)] + imputed_long[, VALUE := VALUE_IMPUTED] + imputed_long[, c("TO_IMPUTE", "MOVING_AVG", "VALUE_IMPUTED") := NULL] + + routine_imputed <- to_routine_wide( + dt_long = imputed_long, + fixed_cols = fixed_cols, + indicators_to_keep = indicators_to_keep, + pyramid_names = pyramid_names + ) + arrow::write_parquet(routine_imputed, file.path(output_dir, paste0(country_code, "_routine_outliers_imputed.parquet"))) + log_msg(glue::glue("Exported routine imputed table to {country_code}_routine_outliers_imputed.parquet")) + + removed_long <- data.table::copy(flagged_outliers_mg) + removed_long[get(active_outlier_col) == TRUE, VALUE := NA_real_] + + routine_removed <- to_routine_wide( + dt_long = removed_long, + fixed_cols = fixed_cols, + indicators_to_keep = indicators_to_keep, + pyramid_names = pyramid_names + ) + arrow::write_parquet(routine_removed, file.path(output_dir, paste0(country_code, "_routine_outliers_removed.parquet"))) + log_msg(glue::glue("Exported routine removed table to {country_code}_routine_outliers_removed.parquet")) + + log_msg("MG outlier tables exported successfully.") + invisible(list(active_outlier_col = 
active_outlier_col)) +} + From e0ee49a524028ce4c75e59c81fdb795f5a1bfeb8 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 30 Mar 2026 11:32:58 +0200 Subject: [PATCH 15/23] generic functions in snt_utils --- code/snt_utils.r | 5 + .../code/NER_pyramid_format.ipynb | 1227 ++++---- .../reporting/snt_dhis2_extract_report.ipynb | 2426 +++++++-------- .../utils/snt_dhis2_extract.r | 73 + .../snt_dhis2_formatting_report.ipynb | 2755 ++++++++--------- .../utils/snt_dhis2_formatting_report.r | 24 + 6 files changed, 3313 insertions(+), 3197 deletions(-) create mode 100644 pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r create mode 100644 pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r diff --git a/code/snt_utils.r b/code/snt_utils.r index 838e276..ba93d28 100644 --- a/code/snt_utils.r +++ b/code/snt_utils.r @@ -232,6 +232,11 @@ get_latest_dataset_file_in_memory <- function(dataset, filename) { } +# helper function for OpenHEXA logging +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + # helper function for OpenHEXA logging log_msg <- function(msg , level="info") { print(msg) diff --git a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb index 9d3e1d2..10aa7c9 100644 --- a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb +++ b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb @@ -1,581 +1,650 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "546af596-8d71-45c8-8d66-d391ff0ab4c4", - "metadata": {}, - "source": [ - "## Specific fix for mixed organisation units Niger hierarchy \n", - "\n", - "These transformations make sense given the original format used to extract organisation units from DHIS2. \n", - "Each row corresponds to a DHIS2 organisation unit with its complete hierarchy.\n", - "\n", - "**Steps:** \n", - " \n", - "-Load raw pyramid extracted from DHIS2. 
\n", - "-Load Organisation units groups that defines the target health facilities that should be in level 6 (this is our **prioritary list**). \n", - "-Parent references for OUs moved from level 3 to level 6, must be updated (district location), for this we use coordinates.\n", - "-Missing coordinates at level 3 are completed manually (coordinates extracted from Google Maps (lat, lon).\n", - "-We update any OUs from the **prioritary list** at level 4 that had their parent modified in the level 3 modifications (set district), and move them to level 6.\n", - "-We update any OUs form the **prioritary list** at level 5 that had their parent modified in the level 3 modifications (set district), and move them to level 6.\n", - "-Add the newly build **level 6** to the final table (includes all previous OUs moved to level 6). \n", - "-Save outputs to the expected output filename (parameter).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82eced35-0826-4fe2-a047-7451b41a42d3", - "metadata": {}, - "outputs": [], - "source": [ - "# PROJECT PATHS\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "\n", - "# Load snt utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries\n", - "required_packages <- c(\"arrow\", \"dplyr\", \"tidyverse\", \"jsonlite\", \"reticulate\", \"glue\", \"sf\")\n", - "install_and_load(required_packages)\n", - "\n", - "# # Load openhexa.sdk and set environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "openhexa_toolbox <- import(\"openhexa.toolbox\")" - ] - }, - { - "cell_type": "markdown", - "id": "e39b3d5b-2996-45a4-9d49-e1fe3261d763", - "metadata": {}, - "source": [ - "### Read the raw pyramid file (parquet)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8814550c-f03d-4a75-9f91-d922d74acd96", - "metadata": {}, - "outputs": [], - 
"source": [ - "# Check if parameters is available\n", - "if (!(\"INPUT_PATH\" %in% ls())) stop(\"[WARNING] Input parameter 'INPUT_PATH' not found. Exiting notebook.\")\n", - "if (!(\"OUTPUT_PATH\" %in% ls())) stop(\"[WARNING] Input parameter 'OUTPUT_PATH' not found. Exiting notebook.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a04337d-0f4f-4491-9667-65eb7eb112b1", - "metadata": {}, - "outputs": [], - "source": [ - "# Read specific file using INPUT_PATH parameter\n", - "pyramid_df <- tryCatch({ read_parquet(file.path(INPUT_PATH)) },\n", - " error = function(e) {\n", - " # Handle errors\n", - " msg <- glue::glue(\"[WARNING] Error reading Parquet file at {INPUT_PATH}\")\n", - " stop(msg) \n", - " })\n", - "\n", - "print(dim(pyramid_df))\n", - "head(pyramid_df %>% select(-geometry), 3) # geometry column is too long to print.\n", - "log_msg(glue::glue(\"NER organisation units transformation: {dim(pyramid_df)[[1]]} Total organisation units.\"))" - ] - }, - { - "cell_type": "markdown", - "id": "822f7192-ba96-4557-990e-2460b3193a81", - "metadata": {}, - "source": [ - "### Load organisation units groups \n", - "\n", - "For details about this **prioritay list** (liste_groupes_prioritaires), see Stephan's code: code in pipelines/snt_dhis2_extract/dev/etl_extract_orgUnit_v2.ipynb)\n", - " \n", - "**Ticket reference**: https://bluesquare.atlassian.net/browse/SNT25-253" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f879733-8dfc-45d9-a2f4-ebbf95b62a0e", - "metadata": {}, - "outputs": [], - "source": [ - "# group selection ids \n", - "liste_groupes_prioritaires = c('EDbDMbIQtPD', 'iGLtZMdDGMD', 'KX9EuY75nGE', 'bZlIiMRLRbA', 'Ag0dMMJp4mH', 'S6YdxQgX8SO', \n", - " 'sB6YOzTUHkF', 'oihgQahh9LH', 'DrQFMU6RoCG', 'pwD7FU7Qfyz', 'dgDPQhxqOcJ','Gox5G2BIGBf')\n", - "\n", - "# Create ou groups\n", - "ou_groups = read_parquet(file.path(SNT_ROOT_PATH, 
\"data/dhis2/extracts_raw/organisation_unit_groups/NER_organisation_unit_groups_raw.parquet\")) # hardcoded\n", - "ou_groups_exploded <- unnest(ou_groups, cols = c(organisation_units)) \n", - "ou_selection <- ou_groups_exploded[ou_groups_exploded$id %in% liste_groupes_prioritaires, ]\n", - "group_prioritaires_table <- pyramid_df[pyramid_df$id %in% unique(ou_selection$organisation_units), ]" - ] - }, - { - "cell_type": "markdown", - "id": "c15a61a1-8989-48ee-a76b-f6d5f906fb0d", - "metadata": {}, - "source": [ - "### Complete coordinates for missing facilities (Manual fix)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d59e67d-c9ad-420a-b416-706b493fb737", - "metadata": {}, - "outputs": [], - "source": [ - "# Org units with missing geometries\n", - "prioritaires_geo_na <- group_prioritaires_table[group_prioritaires_table$level==3, ] %>% filter(is.na(geometry)) # OU with NA\n", - "dim(prioritaires_geo_na)\n", - "head(prioritaires_geo_na, 3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d1ed93d-bf83-4d34-a59a-da43d7a0b4fd", - "metadata": {}, - "outputs": [], - "source": [ - "# Create point\n", - "make_point_geojson <- function(lat, lon) {\n", - " sprintf('{\"type\": \"Point\", \"coordinates\": [%f, %f]}', lon, lat)\n", - "}\n", - "\n", - "# xMqXanPgczy : Centre Sante Mère Enfant Tillaberi\n", - "group_prioritaires_table[group_prioritaires_table$id==\"xMqXanPgczy\",]$geometry <- make_point_geojson(14.212177799561589, 1.4625739941131144) \n", - "# sgO4yBg59SJ :\tHRM Maradi\t\n", - "group_prioritaires_table[group_prioritaires_table$id==\"sgO4yBg59SJ\",]$geometry <- make_point_geojson(13.485271755127068, 7.143422105623865)\n", - "# oHRvIBeR5xH : Hopital Cure\n", - "group_prioritaires_table[group_prioritaires_table$id==\"oHRvIBeR5xH\",]$geometry <- make_point_geojson(13.551421362165923, 2.116344191939423) \n", - "# TVaP0vBLvat : Hopital ophtalmo Makka\n", - 
"group_prioritaires_table[group_prioritaires_table$id==\"TVaP0vBLvat\",]$geometry <- make_point_geojson(13.509657990942971, 2.1473435456528174)\n", - "# evMtQ7bLFYI : Hôpital Général de Référence \n", - "group_prioritaires_table[group_prioritaires_table$id==\"evMtQ7bLFYI\",]$geometry <- make_point_geojson(13.586255600670649, 2.0918749136394097) \n", - "# u3xCSh4hG9Q :\tHôpital Ophtalmologique Makkah \n", - "group_prioritaires_table[group_prioritaires_table$id==\"u3xCSh4hG9Q\",]$geometry <- make_point_geojson(13.509793678687808, 2.147386518669057)\n", - "# P1oyCQT39rj : Hôpital de l'Amıtıé Nıger Turquıe\n", - "group_prioritaires_table[group_prioritaires_table$id==\"P1oyCQT39rj\",]$geometry <- make_point_geojson(13.535431049590938, 2.09186651126039)" - ] - }, - { - "cell_type": "markdown", - "id": "21dac923-d598-4f1d-b30b-fa2b289544f0", - "metadata": {}, - "source": [ - "### Start building the re-arranged pyramid" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba68d5c8-fcea-4439-91a6-7f37e9d94393", - "metadata": {}, - "outputs": [], - "source": [ - "# Re-arrange pyramid structure\n", - "new_pyramid <- pyramid_df[0, ]\n", - "new_pyramid_1 <- rbind(new_pyramid, pyramid_df[pyramid_df$level == 1, ])\n", - "new_pyramid_2 <- rbind(new_pyramid_1, pyramid_df[pyramid_df$level == 2, ])\n", - "\n", - "# handle level 3 (select only Districts)\n", - "new_pyramid_3 <- rbind(new_pyramid_2, pyramid_df[pyramid_df$level == 3 & grepl(\"DS\", pyramid_df$name), ])\n", - "group_prioritaires_level_3 <- group_prioritaires_table[group_prioritaires_table$level == 3, ] # org units being moved to level 6" - ] - }, - { - "cell_type": "markdown", - "id": "43d23831-9647-40af-986c-e770f8eb3c2c", - "metadata": {}, - "source": [ - "Prepare points and polygons to match" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7701fb3b-c5dc-42c9-a2af-fab615e06a65", - "metadata": {}, - "outputs": [], - "source": [ - "# Filter level 3 polygons and convert to 
char\n", - "polygons_level3 <- pyramid_df[pyramid_df$level == 3, ]\n", - "polygons_level3$geometry <- as.character(polygons_level3$geometry)\n", - "group_prioritaires_level_3$geometry <- as.character(group_prioritaires_level_3$geometry)\n", - "\n", - "# Keep only non-empty, non-NA geometries\n", - "polygons_level3 <- polygons_level3[!is.na(polygons_level3$geometry) & polygons_level3$geometry != \"\", ]\n", - "points_level_6 <- group_prioritaires_level_3[!is.na(group_prioritaires_level_3$geometry) & group_prioritaires_level_3$geometry != \"\", ]\n", - "\n", - "# Disable S2 (assume planar coordinates)\n", - "sf_use_s2(TRUE)\n", - "\n", - "# Convert to sf and validate poligons\n", - "polygons_sf <- st_as_sf(polygons_level3, geometry = st_as_sfc(polygons_level3$geometry, GeoJSON = TRUE), crs = 4326)\n", - "points_sf <- st_as_sf(points_level_6, geometry = st_as_sfc(points_level_6$geometry, GeoJSON = TRUE), crs = 4326)\n", - "polygons_sf$geometry <- st_make_valid(polygons_sf$geometry) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04931248-bea3-41ca-b4f9-c0ba64d4d342", - "metadata": {}, - "outputs": [], - "source": [ - "# check\n", - "if (nrow(group_prioritaires_level_3) != nrow(points_sf)) {\n", - " log_msg(glue(\"Check whether all organisation units in the priority list have valid coordinates.\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "2bdb280c-fead-4cba-98d7-a1a846aa18ec", - "metadata": {}, - "source": [ - "Find the corresponding district (DS) at level 3 using the coordinates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6bfdf9c-55fe-41d6-89ab-c062a5c54f30", - "metadata": {}, - "outputs": [], - "source": [ - "# inside_matrix: rows = points, columns = polygons\n", - "inside_matrix <- st_within(points_sf, polygons_sf, sparse = FALSE)\n", - "\n", - "# Check if points fall in a polygon\n", - "point_polygon_dict <- list()\n", - "for (i in seq_len(nrow(points_sf))) {\n", - " point_id <- 
points_sf$id[[i]]\n", - " point_name <- points_sf$name[[i]]\n", - " \n", - " # Which polygons contain this point\n", - " polygons_containing <- which(inside_matrix[i, ])\n", - " \n", - " if (length(polygons_containing) > 0) {\n", - " found_polygons <- polygons_sf[polygons_containing, ] \n", - " found_polygons_ds <- found_polygons[grepl(\"^DS\", found_polygons$name), ]\n", - " \n", - " if (nrow(found_polygons_ds) >= 1) { \n", - " polygon_id <- found_polygons_ds$id[1] # select the first match\n", - " polygon_name <- found_polygons_ds$name[1] \n", - " \n", - " # store in list\n", - " point_polygon_dict[[point_id]] <- list(\n", - " point_name = point_name,\n", - " polygon_id = polygon_id,\n", - " polygon_name = polygon_name\n", - " ) \n", - " print(glue(\"Point: {point_name} ({point_id}) is inside polygon: {polygon_name} ({polygon_id})\"))\n", - " \n", - " } else { \n", - " point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA) \n", - " cat(\"Point:\", point_id, \"is not inside any district (DS) polygon\\n\")\n", - " }\n", - " } else {\n", - " point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA)\n", - " cat(\"Point:\", point_id, \"is not inside any district (DS) polygon\\n\")\n", - " }\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "db0c55a0-35c1-4e73-98c3-7a2cfb61db5d", - "metadata": {}, - "source": [ - "Set the facilities under the corresponding district " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22c31f38-fa0f-46a4-a122-40d9154e6a7b", - "metadata": {}, - "outputs": [], - "source": [ - "# Build rows for level 6 (Health facilities)\n", - "new_level_6 <- pyramid_df[0, ]\n", - "new_level_6 <- rbind(new_level_6, group_prioritaires_level_3)\n", - "\n", - "# Set the facilities under the corresponding district \n", - "for (point_id in names(point_polygon_dict)) {\n", - " entry <- point_polygon_dict[[point_id]]\n", - " print(glue(\"Setting 
{entry$point_name} ({point_id} ) to district : {entry$polygon_name} ({entry$polygon_id})\")) \n", - " new_level_6[new_level_6$id == point_id, ]$level_6_id <- point_id\n", - " new_level_6[new_level_6$id == point_id, ]$level_6_name <- entry$point_name\n", - " new_level_6[new_level_6$id == point_id, ]$level_3_id <- entry$polygon_id\n", - " new_level_6[new_level_6$id == point_id, ]$level_3_name <- entry$polygon_name \n", - " \n", - "}\n", - "new_level_6$level <- 6 # Reset the level\n", - "dim(new_level_6)\n", - "head(new_level_6, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "3a637054-13a5-4874-a3ea-b75dd192ab5e", - "metadata": {}, - "source": [ - "### Fixes for level 4 facilities that got their parent moved to level 6" - ] - }, - { - "cell_type": "markdown", - "id": "3e7791c5-2c2d-439b-8196-8cd35b566c4c", - "metadata": {}, - "source": [ - "Add the level 4 to the `new_pyramid` and update level 4 parents" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d810d844-8d9e-43d2-b8b9-07ec1bc2faf2", - "metadata": {}, - "outputs": [], - "source": [ - "# Are there any org units at level 4 that had the parent at level 3 already moved?\n", - "group_prioritaires_level_4 <- group_prioritaires_table[group_prioritaires_table$level == 4, ]\n", - "\n", - "# Add level 4 to new pyramid (except org units to be moved to lvl 6)\n", - "new_pyramid_4 <- rbind(new_pyramid_3, pyramid_df[pyramid_df$level == 4 & !(pyramid_df$id %in% group_prioritaires_level_4$id), ])\n", - "\n", - "# Run check and fix\n", - "child_lvl_4 <- get_updated_children(\n", - " new_level_table=new_level_6,\n", - " group_table=group_prioritaires_level_4, \n", - " level = 4,\n", - " target_level=6,\n", - " parent_level=3\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "872d0295-bc4a-4809-93a7-a94c8a1030ec", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(child_lvl_4) == 0) {\n", - " log_msg(glue(\"There are no facilities at level 4 that needed an 
update to their parent reference.\"))\n", - "} else {\n", - " log_msg(glue(\"{nrow(child_lvl_4)} facilities had a parent reference updated.\")) \n", - "}\n", - "\n", - "# Move level 4 to level 6\n", - "group_prioritaires_level_4$level <- 6\n", - "group_prioritaires_level_4$level_6_id <- group_prioritaires_level_4$level_4_id\n", - "group_prioritaires_level_4$level_6_name <- group_prioritaires_level_4$level_4_name\n", - "group_prioritaires_level_4$level_4_id <- NA\n", - "group_prioritaires_level_4$level_4_name <- NA\n", - "\n", - "# update group 4\n", - "group_prioritaires_level_4_clean <- group_prioritaires_level_4[!group_prioritaires_level_4$id %in% child_lvl_4$id, ]\n", - "group_prioritaires_level_4_updated <- rbind(group_prioritaires_level_4_clean, child_lvl_4) \n", - "\n", - "new_level_6 <- rbind(new_level_6, group_prioritaires_level_4_updated) \n", - " \n", - "dim(new_level_6)\n", - "tail(new_level_6, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "b3b9936f-7ab2-4c4c-b8ed-6f648b24cf02", - "metadata": {}, - "source": [ - "### Run check for level 5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5a025e7-9d4c-4e76-bfd5-fe2011f6641d", - "metadata": {}, - "outputs": [], - "source": [ - "# select org units at level 5 to be moved -> 6\n", - "group_prioritaires_level_5 <- group_prioritaires_table[group_prioritaires_table$level == 5, ]\n", - "\n", - "# Add level 5 to new pyramid (except org units to be moved to lvl 6)\n", - "new_pyramid_5 <- rbind(new_pyramid_4, pyramid_df[pyramid_df$level == 5 & !(pyramid_df$id %in% group_prioritaires_level_5$id), ])\n", - "\n", - "# Run check and fix\n", - "child_lvl_5 <- get_updated_children(\n", - " new_level_table=new_level_6,\n", - " group_table=group_prioritaires_level_5, \n", - " level = 5,\n", - " target_level=6,\n", - " parent_level=3\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c5ece57-2373-43cb-8f93-0d83c0f916f0", - "metadata": {}, - "outputs": [], - 
"source": [ - "dim(new_pyramid_5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "514f30e9-5a95-4835-8872-1d837a4d07ef", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(child_lvl_5) == 0) {\n", - " log_msg(glue(\"There are no facilities at level 5 that needed an update to their parent reference.\"))\n", - "} else {\n", - " log_msg(glue(\"{nrow(child_lvl_5)} facilities had a parent reference updated.\")) \n", - "}\n", - "\n", - "# Move level 5 to level 6\n", - "group_prioritaires_level_5$level <- 6\n", - "group_prioritaires_level_5$level_6_id <- group_prioritaires_level_5$level_5_id\n", - "group_prioritaires_level_5$level_6_name <- group_prioritaires_level_5$level_5_name\n", - "group_prioritaires_level_5$level_5_id <- NA\n", - "group_prioritaires_level_5$level_5_name <- NA\n", - "\n", - "# update group 5\n", - "group_prioritaires_level_5_clean <- group_prioritaires_level_5[!group_prioritaires_level_5$id %in% child_lvl_5$id, ]\n", - "group_prioritaires_level_5_updated <- rbind(group_prioritaires_level_5_clean, child_lvl_5) \n", - "\n", - "new_level_6 <- rbind(new_level_6, group_prioritaires_level_5_updated) \n", - " \n", - "dim(new_level_6)\n", - "tail(new_level_6, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "3c031cdd-1191-4c50-b9dd-aafe9a8d9c53", - "metadata": {}, - "source": [ - "### Select and add level 6 org units" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "385a933b-18b7-4239-b0eb-2ad21f1c3aad", - "metadata": {}, - "outputs": [], - "source": [ - "# select org units at level 5 to be moved -> 6\n", - "group_prioritaires_level_6 <- group_prioritaires_table[group_prioritaires_table$level == 6, ]\n", - "\n", - "# Add level 6 to new pyramid \n", - "# Comment this line if we want only to include the org units in the list\n", - "new_pyramid_6 <- rbind(new_pyramid_5, pyramid_df[pyramid_df$level == 6 & !(pyramid_df$id %in% group_prioritaires_level_6$id), ])\n", - "\n", - "new_level_6 <- 
rbind(new_level_6, group_prioritaires_level_6)\n", - "new_level_6 <- new_level_6[!duplicated(new_level_6$id), ]\n", - "\n", - "dim(new_level_6)\n", - "tail(new_level_6, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "d6e9fcbb-feca-47fd-a028-d9e0423b00c2", - "metadata": {}, - "source": [ - "### Create final pyramid" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57a1df59-3c18-4a52-af3f-32378db85e13", - "metadata": {}, - "outputs": [], - "source": [ - "# Add level 6 to new pyramid (based on the org units prioritaire list)\n", - "final_pyramid <- rbind(new_pyramid_6, new_level_6)\n", - "dim(final_pyramid)\n", - "tail(final_pyramid, 3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "025a736e-71a7-4f5a-afc7-2b204ecd0b15", - "metadata": {}, - "outputs": [], - "source": [ - "for (level in unique(pyramid_df$level)) {\n", - " count <- nrow(pyramid_df[pyramid_df$level==level, ])\n", - " print(glue(\"Original pyramid OU at level {level}: {count}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b6b8e2a-b70e-43e2-b0db-80fe9b8044a8", - "metadata": {}, - "outputs": [], - "source": [ - "for (level in unique(final_pyramid$level)) {\n", - " count <- nrow(final_pyramid[final_pyramid$level==level, ])\n", - " print(glue(\"New pyramid OU at level {level}: {count}\"))\n", - "} " - ] - }, - { - "cell_type": "markdown", - "id": "b2d7ae0e-7241-4753-b2c9-2a74ca0659f3", - "metadata": {}, - "source": [ - "### Save output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f28a7de-ee1e-418b-8d71-60bcd219ac48", - "metadata": {}, - "outputs": [], - "source": [ - "# Write to parquet\n", - "log_msg(glue::glue(\"NER organisation units transformation: Saving transformed organisation units under {OUTPUT_PATH}.\"))\n", - "write_parquet(final_pyramid, file.path(OUTPUT_PATH))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, -
"language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "546af596-8d71-45c8-8d66-d391ff0ab4c4", + "metadata": {}, + "source": [ + "## Specific fix for mixed organisation units Niger hierarchy \n", + "\n", + "These transformations make sense given the original format used to extract organisation units from DHIS2. \n", + "Each row corresponds to a DHIS2 organisation unit with its complete hierarchy.\n", + "\n", + "**Steps:** \n", + " \n", + "-Load raw pyramid extracted from DHIS2. \n", + "-Load the organisation unit groups that define the target health facilities that should be in level 6 (this is our **priority list**). \n", + "-Parent references for OUs moved from level 3 to level 6 must be updated (district location); for this we use coordinates.\n", + "-Missing coordinates at level 3 are completed manually (coordinates extracted from Google Maps as lat, lon).\n", + "-We update any OUs from the **priority list** at level 4 that had their parent modified in the level 3 modifications (set district), and move them to level 6.\n", + "-We update any OUs from the **priority list** at level 5 that had their parent modified in the level 3 modifications (set district), and move them to level 6.\n", + "-Add the newly built **level 6** to the final table (includes all previous OUs moved to level 6). 
\n", + "-Save outputs to the expected output filename (parameter).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82eced35-0826-4fe2-a047-7451b41a42d3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# PROJECT PATHS\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code')\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\")\n", + "\n", + "# Load snt utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_extract.r\"))\n", + "\n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"dplyr\", \"tidyverse\", \"jsonlite\", \"reticulate\", \"glue\", \"sf\")\n", + "install_and_load(required_packages)\n", + "\n", + "# # Load openhexa.sdk and set environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "openhexa_toolbox <- import(\"openhexa.toolbox\")" + ] + }, + { + "cell_type": "markdown", + "id": "e39b3d5b-2996-45a4-9d49-e1fe3261d763", + "metadata": {}, + "source": [ + "### Read the raw pyramid file (parquet)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8814550c-f03d-4a75-9f91-d922d74acd96", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if parameters is available\n", + "if (!(\"INPUT_PATH\" %in% ls())) stop(\"[WARNING] Input parameter 'INPUT_PATH' not found. Exiting notebook.\")\n", + "if (!(\"OUTPUT_PATH\" %in% ls())) stop(\"[WARNING] Input parameter 'OUTPUT_PATH' not found. 
Exiting notebook.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a04337d-0f4f-4491-9667-65eb7eb112b1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Read specific file using INPUT_PATH parameter\n", + "pyramid_df <- tryCatch({ read_parquet(file.path(INPUT_PATH)) },\n", + " error = function(e) {\n", + " # Handle errors\n", + " msg <- glue::glue(\"[WARNING] Error reading Parquet file at {INPUT_PATH}\")\n", + " stop(msg) \n", + " })\n", + "\n", + "print(dim(pyramid_df))\n", + "head(pyramid_df %>% select(-geometry), 3) # geometry column is too long to print.\n", + "log_msg(glue::glue(\"NER organisation units transformation: {dim(pyramid_df)[[1]]} Total organisation units.\"))" + ] + }, + { + "cell_type": "markdown", + "id": "822f7192-ba96-4557-990e-2460b3193a81", + "metadata": {}, + "source": [ + "### Load organisation units groups \n", + "\n", + "For details about this **priority list** (liste_groupes_prioritaires), see Stephan's code in pipelines/snt_dhis2_extract/dev/etl_extract_orgUnit_v2.ipynb.\n", + " \n", + "**Ticket reference**: https://bluesquare.atlassian.net/browse/SNT25-253" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f879733-8dfc-45d9-a2f4-ebbf95b62a0e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# group selection ids \n", + "liste_groupes_prioritaires = c('EDbDMbIQtPD', 'iGLtZMdDGMD', 'KX9EuY75nGE', 'bZlIiMRLRbA', 'Ag0dMMJp4mH', 'S6YdxQgX8SO', \n", + " 'sB6YOzTUHkF', 'oihgQahh9LH', 'DrQFMU6RoCG', 'pwD7FU7Qfyz', 'dgDPQhxqOcJ','Gox5G2BIGBf')\n", + "\n", + "# Create ou groups\n", + "ou_groups = read_parquet(file.path(SNT_ROOT_PATH, \"data/dhis2/extracts_raw/organisation_unit_groups/NER_organisation_unit_groups_raw.parquet\")) # hardcoded\n", + "ou_groups_exploded <- unnest(ou_groups, cols = c(organisation_units)) \n", + "ou_selection <- ou_groups_exploded[ou_groups_exploded$id %in% 
liste_groupes_prioritaires, ]\n", + "group_prioritaires_table <- pyramid_df[pyramid_df$id %in% unique(ou_selection$organisation_units), ]" + ] + }, + { + "cell_type": "markdown", + "id": "c15a61a1-8989-48ee-a76b-f6d5f906fb0d", + "metadata": {}, + "source": [ + "### Complete coordinates for missing facilities (Manual fix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d59e67d-c9ad-420a-b416-706b493fb737", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Org units with missing geometries\n", + "prioritaires_geo_na <- group_prioritaires_table[group_prioritaires_table$level==3, ] %>% filter(is.na(geometry)) # OU with NA\n", + "dim(prioritaires_geo_na)\n", + "head(prioritaires_geo_na, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d1ed93d-bf83-4d34-a59a-da43d7a0b4fd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Apply manual geometry fixes for known NER facilities\n", + "group_prioritaires_table <- apply_ner_manual_geometry_fixes(group_prioritaires_table)" + ] + }, + { + "cell_type": "markdown", + "id": "21dac923-d598-4f1d-b30b-fa2b289544f0", + "metadata": {}, + "source": [ + "### Start building the re-arranged pyramid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba68d5c8-fcea-4439-91a6-7f37e9d94393", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Re-arrange pyramid structure\n", + "new_pyramid <- pyramid_df[0, ]\n", + "new_pyramid_1 <- rbind(new_pyramid, pyramid_df[pyramid_df$level == 1, ])\n", + "new_pyramid_2 <- rbind(new_pyramid_1, pyramid_df[pyramid_df$level == 2, ])\n", + "\n", + "# handle level 3 (select only Districts)\n", + "new_pyramid_3 <- rbind(new_pyramid_2, pyramid_df[pyramid_df$level == 3 & grepl(\"DS\", pyramid_df$name), ])\n", + "group_prioritaires_level_3 <- group_prioritaires_table[group_prioritaires_table$level == 3, ] # org 
units being moved to level 6" + ] + }, + { + "cell_type": "markdown", + "id": "43d23831-9647-40af-986c-e770f8eb3c2c", + "metadata": {}, + "source": [ + "Prepare points and polygons to match" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7701fb3b-c5dc-42c9-a2af-fab615e06a65", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Filter level 3 polygons and convert to char\n", + "polygons_level3 <- pyramid_df[pyramid_df$level == 3, ]\n", + "polygons_level3$geometry <- as.character(polygons_level3$geometry)\n", + "group_prioritaires_level_3$geometry <- as.character(group_prioritaires_level_3$geometry)\n", + "\n", + "# Keep only non-empty, non-NA geometries\n", + "polygons_level3 <- polygons_level3[!is.na(polygons_level3$geometry) & polygons_level3$geometry != \"\", ]\n", + "points_level_6 <- group_prioritaires_level_3[!is.na(group_prioritaires_level_3$geometry) & group_prioritaires_level_3$geometry != \"\", ]\n", + "\n", + "# NOTE: sf_use_s2(TRUE) enables S2 (spherical geometry); pass FALSE instead for planar geometry\n", + "sf_use_s2(TRUE)\n", + "\n", + "# Convert to sf and validate polygons\n", + "polygons_sf <- st_as_sf(polygons_level3, geometry = st_as_sfc(polygons_level3$geometry, GeoJSON = TRUE), crs = 4326)\n", + "points_sf <- st_as_sf(points_level_6, geometry = st_as_sfc(points_level_6$geometry, GeoJSON = TRUE), crs = 4326)\n", + "polygons_sf$geometry <- st_make_valid(polygons_sf$geometry) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04931248-bea3-41ca-b4f9-c0ba64d4d342", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# check\n", + "if (nrow(group_prioritaires_level_3) != nrow(points_sf)) {\n", + " log_msg(glue(\"Check whether all organisation units in the priority list have valid coordinates.\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "2bdb280c-fead-4cba-98d7-a1a846aa18ec", + "metadata": {}, + "source": [ + "Find the corresponding district 
(DS) at level 3 using the coordinates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6bfdf9c-55fe-41d6-89ab-c062a5c54f30", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# inside_matrix: rows = points, columns = polygons\n", + "inside_matrix <- st_within(points_sf, polygons_sf, sparse = FALSE)\n", + "\n", + "# Check if points fall in a polygon\n", + "point_polygon_dict <- list()\n", + "for (i in seq_len(nrow(points_sf))) {\n", + " point_id <- points_sf$id[[i]]\n", + " point_name <- points_sf$name[[i]]\n", + " \n", + " # Which polygons contain this point\n", + " polygons_containing <- which(inside_matrix[i, ])\n", + " \n", + " if (length(polygons_containing) > 0) {\n", + " found_polygons <- polygons_sf[polygons_containing, ] \n", + " found_polygons_ds <- found_polygons[grepl(\"^DS\", found_polygons$name), ]\n", + " \n", + " if (nrow(found_polygons_ds) >= 1) { \n", + " polygon_id <- found_polygons_ds$id[1] # select the first match\n", + " polygon_name <- found_polygons_ds$name[1] \n", + " \n", + " # store in list\n", + " point_polygon_dict[[point_id]] <- list(\n", + " point_name = point_name,\n", + " polygon_id = polygon_id,\n", + " polygon_name = polygon_name\n", + " ) \n", + " print(glue(\"Point: {point_name} ({point_id}) is inside polygon: {polygon_name} ({polygon_id})\"))\n", + " \n", + " } else { \n", + " point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA) \n", + " cat(\"Point:\", point_id, \"is not inside any district (DS) polygon\\n\")\n", + " }\n", + " } else {\n", + " point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA)\n", + " cat(\"Point:\", point_id, \"is not inside any district (DS) polygon\\n\")\n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "db0c55a0-35c1-4e73-98c3-7a2cfb61db5d", + "metadata": {}, + "source": [ + "Set the facilities under the corresponding district 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22c31f38-fa0f-46a4-a122-40d9154e6a7b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Build rows for level 6 (Health facilities)\n", + "new_level_6 <- pyramid_df[0, ]\n", + "new_level_6 <- rbind(new_level_6, group_prioritaires_level_3)\n", + "\n", + "# Set the facilities under the corresponding district \n", + "for (point_id in names(point_polygon_dict)) {\n", + " entry <- point_polygon_dict[[point_id]]\n", + " print(glue(\"Setting {entry$point_name} ({point_id} ) to district : {entry$polygon_name} ({entry$polygon_id})\")) \n", + " new_level_6[new_level_6$id == point_id, ]$level_6_id <- point_id\n", + " new_level_6[new_level_6$id == point_id, ]$level_6_name <- entry$point_name\n", + " new_level_6[new_level_6$id == point_id, ]$level_3_id <- entry$polygon_id\n", + " new_level_6[new_level_6$id == point_id, ]$level_3_name <- entry$polygon_name \n", + " \n", + "}\n", + "new_level_6$level <- 6 # Reset the level\n", + "dim(new_level_6)\n", + "head(new_level_6, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "3a637054-13a5-4874-a3ea-b75dd192ab5e", + "metadata": {}, + "source": [ + "### Fixes for level 4 facilities that got their parent moved to level 6" + ] + }, + { + "cell_type": "markdown", + "id": "3e7791c5-2c2d-439b-8196-8cd35b566c4c", + "metadata": {}, + "source": [ + "Add the level 4 to the `new_pyramid` and update level 4 parents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d810d844-8d9e-43d2-b8b9-07ec1bc2faf2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Are there any org units at level 4 that had the parent at level 3 already moved?\n", + "group_prioritaires_level_4 <- group_prioritaires_table[group_prioritaires_table$level == 4, ]\n", + "\n", + "# Add level 4 to new pyramid (except org units to be moved to lvl 6)\n", + "new_pyramid_4 <- rbind(new_pyramid_3, 
pyramid_df[pyramid_df$level == 4 & !(pyramid_df$id %in% group_prioritaires_level_4$id), ])\n", + "\n", + "# Run check and fix\n", + "child_lvl_4 <- get_updated_children(\n", + " new_level_table=new_level_6,\n", + " group_table=group_prioritaires_level_4, \n", + " level = 4,\n", + " target_level=6,\n", + " parent_level=3\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "872d0295-bc4a-4809-93a7-a94c8a1030ec", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(child_lvl_4) == 0) {\n", + " log_msg(glue(\"There are no facilities at level 4 that needed an update to their parent reference.\"))\n", + "} else {\n", + " log_msg(glue(\"{nrow(child_lvl_4)} facilities had a parent reference updated.\")) \n", + "}\n", + "\n", + "# Move level 4 to level 6\n", + "group_prioritaires_level_4$level <- 6\n", + "group_prioritaires_level_4$level_6_id <- group_prioritaires_level_4$level_4_id\n", + "group_prioritaires_level_4$level_6_name <- group_prioritaires_level_4$level_4_name\n", + "group_prioritaires_level_4$level_4_id <- NA\n", + "group_prioritaires_level_4$level_4_name <- NA\n", + "\n", + "# update group 4\n", + "group_prioritaires_level_4_clean <- group_prioritaires_level_4[!group_prioritaires_level_4$id %in% child_lvl_4$id, ]\n", + "group_prioritaires_level_4_updated <- rbind(group_prioritaires_level_4_clean, child_lvl_4) \n", + "\n", + "new_level_6 <- rbind(new_level_6, group_prioritaires_level_4_updated) \n", + " \n", + "dim(new_level_6)\n", + "tail(new_level_6, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "b3b9936f-7ab2-4c4c-b8ed-6f648b24cf02", + "metadata": {}, + "source": [ + "### Run check for level 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a025e7-9d4c-4e76-bfd5-fe2011f6641d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select org units at level 5 to be moved -> 6\n", + "group_prioritaires_level_5 <- 
group_prioritaires_table[group_prioritaires_table$level == 5, ]\n", + "\n", + "# Add level 5 to new pyramid (except org units to be moved to lvl 6)\n", + "new_pyramid_5 <- rbind(new_pyramid_4, pyramid_df[pyramid_df$level == 5 & !(pyramid_df$id %in% group_prioritaires_level_5$id), ])\n", + "\n", + "# Run check and fix\n", + "child_lvl_5 <- get_updated_children(\n", + " new_level_table=new_level_6,\n", + " group_table=group_prioritaires_level_5, \n", + " level = 5,\n", + " target_level=6,\n", + " parent_level=3\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c5ece57-2373-43cb-8f93-0d83c0f916f0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dim(new_pyramid_5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "514f30e9-5a95-4835-8872-1d837a4d07ef", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(child_lvl_5) == 0) {\n", + " log_msg(glue(\"There are no facilities at level 5 that needed an update to their parent reference.\"))\n", + "} else {\n", + " log_msg(glue(\"{nrow(child_lvl_5)} facilities had a parent reference updated.\")) \n", + "}\n", + "\n", + "# Move level 5 to level 6\n", + "group_prioritaires_level_5$level <- 6\n", + "group_prioritaires_level_5$level_6_id <- group_prioritaires_level_5$level_5_id\n", + "group_prioritaires_level_5$level_6_name <- group_prioritaires_level_5$level_5_name\n", + "group_prioritaires_level_5$level_5_id <- NA\n", + "group_prioritaires_level_5$level_5_name <- NA\n", + "\n", + "# update group 5\n", + "group_prioritaires_level_5_clean <- group_prioritaires_level_5[!group_prioritaires_level_5$id %in% child_lvl_5$id, ]\n", + "group_prioritaires_level_5_updated <- rbind(group_prioritaires_level_5_clean, child_lvl_5) \n", + "\n", + "new_level_6 <- rbind(new_level_6, group_prioritaires_level_5_updated) \n", + " \n", + "dim(new_level_6)\n", + "tail(new_level_6, 3)" + ] + }, + { +
"cell_type": "markdown", + "id": "3c031cdd-1191-4c50-b9dd-aafe9a8d9c53", + "metadata": {}, + "source": [ + "### Select and add level 6 org units" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "385a933b-18b7-4239-b0eb-2ad21f1c3aad", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select priority-list org units that are already at level 6\n", + "group_prioritaires_level_6 <- group_prioritaires_table[group_prioritaires_table$level == 6, ]\n", + "\n", + "# Add level 6 to new pyramid \n", + "# Comment out the next line to include only the org units in the priority list\n", + "new_pyramid_6 <- rbind(new_pyramid_5, pyramid_df[pyramid_df$level == 6 & !(pyramid_df$id %in% group_prioritaires_level_6$id), ])\n", + "\n", + "new_level_6 <- rbind(new_level_6, group_prioritaires_level_6)\n", + "new_level_6 <- new_level_6[!duplicated(new_level_6$id), ]\n", + "\n", + "dim(new_level_6)\n", + "tail(new_level_6, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "d6e9fcbb-feca-47fd-a028-d9e0423b00c2", + "metadata": {}, + "source": [ + "### Create final pyramid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57a1df59-3c18-4a52-af3f-32378db85e13", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add level 6 to new pyramid (based on the priority list of org units)\n", + "final_pyramid <- rbind(new_pyramid_6, new_level_6)\n", + "dim(final_pyramid)\n", + "tail(final_pyramid, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "025a736e-71a7-4f5a-afc7-2b204ecd0b15", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "for (level in unique(pyramid_df$level)) {\n", + " count <- nrow(pyramid_df[pyramid_df$level==level, ])\n", + " print(glue(\"Original pyramid OU at level {level}: {count}\"))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6b8e2a-b70e-43e2-b0db-80fe9b8044a8", 
+ "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "for (level in unique(final_pyramid$level)) {\n", + " count <- nrow(final_pyramid[final_pyramid$level==level, ])\n", + " print(glue(\"New pyramid OU at level {level}: {count}\"))\n", + "} " + ] + }, + { + "cell_type": "markdown", + "id": "b2d7ae0e-7241-4753-b2c9-2a74ca0659f3", + "metadata": {}, + "source": [ + "### Save output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f28a7de-ee1e-418b-8d71-60bcd219ac48", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Write to parquet\n", + "log_msg(glue::glue(\"NER organisation units transformation: Saving transformed organisation units under {OUTPUT_PATH}.\"))\n", + "write_parquet(final_pyramid, file.path(OUTPUT_PATH))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb b/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb index eb9f387..9afd77b 100644 --- a/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb +++ b/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb @@ -1,1232 +1,1202 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", - "metadata": {}, - "source": [ - "## **Extraction des données de routine**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43794265-533f-4035-bf3d-975a3409507b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "## CONFIGURATION ##" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"2ced7513-0ee6-4b9b-ac07-124e510119af", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "FIGURES_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\", \"reporting\", \"outputs\", \"figures\")\n", - "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# Configuration variables\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", - "indicator_defs <- config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADM_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - 
"facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8edc2a5-07ce-4507-9939-4322fc510593", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# import analytics DHIS2 data\n", - "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 organisation units data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "#reporting_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_reporting.parquet\")) }, \n", - "# error = function(e) {\n", - "# msg <- paste(\"Error loading \" , COUNTRY_CODE , \" DHIS2 reporting rates data : \" , \n", - "# paste0(COUNTRY_CODE, \"_dhis2_raw_reporting.parquet\"), \" can not be loaded.\")\n", - "# cat(msg)\n", - "# log_msg(msg, \"warning\") \n", - "# return(NULL)\n", - "# })\n", - "\n", - "printdim(routine_data)\n", - "printdim(pyramid_data)\n", - "#printdim(reporting_data)" - ] - }, - { - "cell_type": "markdown", - "id": 
"c3cee574-8d66-4cd5-8fe6-97f39daa158b", - "metadata": {}, - "source": [ - "### 1. Liste des éléments de donnée extraits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 1. Extract the list of categories and their DX codes\n", - "category_elements <- map(indicator_defs, ~ .x) # safely preserve all vectors\n", - "category_names <- names(category_elements)\n", - "\n", - "# 2. Get unique DX and DX_NAME from your main dataset\n", - "data_elements <- routine_data %>%\n", - " select(DX, DX_NAME) %>%\n", - " distinct()\n", - "\n", - "# 3. Build a lookup table assigning category to each DX\n", - "classified_elements <- bind_rows(lapply(category_names, function(cat) {\n", - " ids <- category_elements[[cat]]\n", - " data_elements %>%\n", - " filter(DX %in% ids) %>%\n", - " mutate(Categorie = cat)\n", - "}))\n", - "\n", - "# 4. Display results sorted\n", - "classified_elements %>%\n", - " arrange(Categorie, DX_NAME) %>%\n", - " kable(\n", - " caption = \"Liste des éléments de données extraits, classés par indicateur\",\n", - " col.names = c(\"ID de l'élément\", \"Nom de l'élément de donnée\", \"Indicateur\")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd12633d-8aa2-4ac9-91c4-e1c651031275", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Simple table of data elements and their disaggregations\n", - "disaggregations_table <- routine_data %>%\n", - " distinct(DX, DX_NAME, CO_NAME) %>%\n", - " group_by(DX, DX_NAME) %>%\n", - " summarise(\n", - " `Désagrégations` = paste(sort(unique(na.omit(CO_NAME))), collapse = \"; \"),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(`Désagrégations` = ifelse(`Désagrégations` == \"\", \"—\", `Désagrégations`)) %>%\n", - " arrange(DX_NAME)\n", - "\n", - "# Display\n", - "disaggregations_table" - ] - 
}, - { - "cell_type": "markdown", - "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d", - "metadata": {}, - "source": [ - "### 2. Période de couverture des données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae59b2f0-0d97-4e04-a7ea-aa136b03cc68", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Extraction des données de routine**" + ], + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "## CONFIGURATION ##" + ], + "execution_count": null, + "outputs": [], + "id": "43794265-533f-4035-bf3d-975a3409507b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\")\n", + "FIGURES_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\", \"reporting\", \"outputs\", \"figures\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_extract.r\"))\n", + "\n", + "# List required packages \n", + "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"scales\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"knitr\", \"glue\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ], + "execution_count": null, + "outputs": [], 
+ "id": "2ced7513-0ee6-4b9b-ac07-124e510119af" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "# Configuration variables\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", + "indicator_defs <- config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADM_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL" + ], + "execution_count": null, + "outputs": [], + "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# printdim() loaded from code/snt_utils.r" + ], + "execution_count": null, + "outputs": [], + "id": "f8edc2a5-07ce-4507-9939-4322fc510593" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# import analytics DHIS2 data\n", + "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 organisation units data for: \" , 
COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "#reporting_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_reporting.parquet\")) }, \n", + "# error = function(e) {\n", + "# msg <- paste(\"Error loading \" , COUNTRY_CODE , \" DHIS2 reporting rates data : \" , \n", + "# paste0(COUNTRY_CODE, \"_dhis2_raw_reporting.parquet\"), \" can not be loaded.\")\n", + "# cat(msg)\n", + "# log_msg(msg, \"warning\") \n", + "# return(NULL)\n", + "# })\n", + "\n", + "printdim(routine_data)\n", + "printdim(pyramid_data)\n", + "#printdim(reporting_data)" + ], + "execution_count": null, + "outputs": [], + "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Liste des éléments de donnée extraits" + ], + "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 1. Extract the list of categories and their DX codes\n", + "category_elements <- map(indicator_defs, ~ .x) # safely preserve all vectors\n", + "category_names <- names(category_elements)\n", + "\n", + "# 2. Get unique DX and DX_NAME from your main dataset\n", + "data_elements <- routine_data %>%\n", + " select(DX, DX_NAME) %>%\n", + " distinct()\n", + "\n", + "# 3. Build a lookup table assigning category to each DX\n", + "classified_elements <- bind_rows(lapply(category_names, function(cat) {\n", + " ids <- category_elements[[cat]]\n", + " data_elements %>%\n", + " filter(DX %in% ids) %>%\n", + " mutate(Categorie = cat)\n", + "}))\n", + "\n", + "# 4. 
Display results sorted\n", + "classified_elements %>%\n", + " arrange(Categorie, DX_NAME) %>%\n", + " kable(\n", + " caption = \"Liste des éléments de données extraits, classés par indicateur\",\n", + " col.names = c(\"ID de l'élément\", \"Nom de l'élément de donnée\", \"Indicateur\")\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Simple table of data elements and their disaggregations\n", + "disaggregations_table <- routine_data %>%\n", + " distinct(DX, DX_NAME, CO_NAME) %>%\n", + " group_by(DX, DX_NAME) %>%\n", + " summarise(\n", + " `Désagrégations` = paste(sort(unique(na.omit(CO_NAME))), collapse = \"; \"),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(`Désagrégations` = ifelse(`Désagrégations` == \"\", \"—\", `Désagrégations`)) %>%\n", + " arrange(DX_NAME)\n", + "\n", + "# Display\n", + "disaggregations_table" + ], + "execution_count": null, + "outputs": [], + "id": "fd12633d-8aa2-4ac9-91c4-e1c651031275" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Période de couverture des données" + ], + "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Mois minimum et maximum dans le jeu de données\n", + "cat(\"Premier mois pour lequel les données ont été extraites :\", min(routine_data$PE), \"\\n\")\n", + "cat(\"Dernier mois pour lequel les données ont été extraites :\", max(routine_data$PE), \"\\n\")\n", + "cat(\"Nombre total de mois couverts par les données :\", length(unique(routine_data$PE)), \"\\n\")\n", + "\n", + "# Vérification des mois manquants (en supposant des données mensuelles entre min et max)\n", + "all_months <- seq(ymd(paste0(min(routine_data$PE), \"01\")),\n", + " ymd(paste0(max(routine_data$PE), \"01\")),\n", + " by = \"1 month\") %>%\n", + " format(\"%Y%m\")" + ], + "execution_count": null, + "outputs": [], + "id": "ae59b2f0-0d97-4e04-a7ea-aa136b03cc68" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Résumé hierarchique" + ], + "id": "05f6938d-046b-4742-b8cb-3840a3646fb7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Map NAME -> ID (robust if already *_ID)\n", + "adm1_id <- ifelse(str_ends(ADM_1, \"_ID\"), ADM_1, str_replace(ADM_1, \"_NAME$\", \"_ID\"))\n", + "adm2_id <- ifelse(str_ends(ADM_2, \"_ID\"), ADM_2, str_replace(ADM_2, \"_NAME$\", \"_ID\"))\n", + "\n", + "# Collect and order available LEVEL_*_ID columns\n", + "level_id_cols <- names(pyramid_data)[grepl(\"^LEVEL_\\\\d+_ID$\", names(pyramid_data))]\n", + "level_order <- as.integer(str_match(level_id_cols, \"^LEVEL_(\\\\d+)_ID$\")[,2])\n", + "level_id_cols <- level_id_cols[order(level_order)]\n", + "\n", + "# Build summary (counts of unique IDs per level)\n", + "level_summary <- tibble(Column = level_id_cols) %>%\n", + " mutate(\n", + " Level = as.integer(str_match(Column, \"^LEVEL_(\\\\d+)_ID$\")[,2]),\n", + " `Nombre d'unités` = map_int(Column, ~ n_distinct(pyramid_data[[.x]], na.rm = TRUE))\n", + " ) %>%\n", + " arrange(Level)\n", + "\n", + "# Add role labels using *_ID columns\n", + "level_summary <- level_summary %>%\n", + " mutate(\n", + " Rôle = case_when(\n", + " Column == adm1_id ~ \"ADM_1 (administration 1)\",\n", + " Column == adm2_id ~ \"ADM_2 (administration 2)\",\n", + " Level == facility_level ~ glue(\"Niveau des FOSA (L{facility_level})\"),\n", + " TRUE ~ \"\"\n", + " )\n", + " )\n", + "\n", + "# Pretty print\n", + "level_summary %>%\n", + " mutate(Niveau = paste0(\"L\", Level)) %>%\n", + " select(Niveau, Column, `Nombre d'unités`, Rôle) %>%\n", + " kable(caption = \"Résumé hiérarchique: nombre d’unités (IDs) uniques par niveau (pyramid_data)\")\n", + "\n", + "cat(glue(\n", + " \"\\nNote : ADM_1 est mappé sur `{ADM_1}` → `{adm1_id}`, ADM_2 sur `{ADM_2}` → `{adm2_id}`. 
\",\n", + " \"Le niveau opérationnel des formations sanitaires est L{facility_level}.\\n\"\n", + "))" + ], + "execution_count": null, + "outputs": [], + "id": "7533a147-6e04-4789-8f3a-e4687fb886b9" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Nombre et activité des formations sanitaires" + ], + "id": "c413e780-7a1a-4241-a06b-274e77d41b50" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Nombre total de formations sanitaires uniques selon le niveau organisationnel défini dans la pyramide\n", + "total_facilities <- pyramid_data %>% \n", + " pull(!!sym(paste0(\"LEVEL_\", facility_level, \"_ID\"))) %>%\n", + " unique() %>% \n", + " length()\n", + "\n", + "cat(glue::glue(\n", + " \"Les établissements sont identifiés de manière unique par leur identifiant d’unité organisationnelle issu de la pyramide, \",\n", + " \"c’est-à-dire le niveau {facility_level} de la hiérarchie sanitaire. \",\n", + " \"Au total, {total_facilities} formations sanitaires uniques ont été identifiées à ce niveau.\"\n", + "))" + ], + "execution_count": null, + "outputs": [], + "id": "239d062c-143d-4a7c-ac93-bee280e1d57a" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Vérification de l’activité : une formation sanitaire est considérée comme « active »\n", + "# si elle a rapporté au moins une valeur (y compris zéro) pendant la période spécifiée.\n", + "activity <- routine_data %>%\n", + " group_by(OU, PE) %>%\n", + " summarise(active = any(!is.na(VALUE)), .groups = \"drop\")\n", + "\n", + "# Nombre de formations sanitaires actives au moins une fois\n", + "active_facilities <- activity %>%\n", + " group_by(OU) %>%\n", + " summarise(active_ever = any(active), .groups = \"drop\") %>%\n", + " filter(active_ever) %>%\n", + " nrow()\n", + "\n", + "# Proportion d’établissements actifs\n", + "proportion_active <- 100 * active_facilities / 
total_facilities\n", + "\n", + "# Résumé des résultats (version enrichie)\n", + "period_start <- min(routine_data$PE)\n", + "period_end <- max(routine_data$PE)\n", + "\n", + "cat(glue(\n", + " \"Sur un total de {total_facilities} formations sanitaires uniques identifiées dans la pyramide, \",\n", + " \"{active_facilities} ont rapporté au moins une donnée sur un élément au cours de la période spécifiée \",\n", + " \"dans les données de routine ({period_start}–{period_end}), \",\n", + " \"soit {round(proportion_active, 1)} % d’établissements ayant effectivement transmis des données.\"\n", + "))" + ], + "execution_count": null, + "outputs": [], + "id": "4bc914d8-852d-4615-bcba-64bb2d33c56c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Years from routine (already fine)\n", + "yrs_rout <- sort(unique(as.integer(substr(routine_data$PE, 1, 4))))\n", + "years <- seq(min(yrs_rout, na.rm = TRUE), max(yrs_rout, na.rm = TRUE), by = 1)\n", + "\n", + "open_per_year <- bind_rows(lapply(years, open_in_year, df = pyramid_data)) %>%\n", + " mutate(Annee = as.integer(Annee))\n", + "\n", + "reported_per_year <- routine_data %>%\n", + " mutate(Annee = as.integer(substr(PE, 1, 4))) %>%\n", + " filter(Annee %in% years, !is.na(OU)) %>%\n", + " group_by(Annee, OU) %>%\n", + " summarise(any_value = any(!is.na(VALUE)), .groups = \"drop\") %>%\n", + " group_by(Annee) %>%\n", + " summarise(Ayant_rapporte_routine = sum(any_value, na.rm = TRUE), .groups = \"drop\")\n", + "\n", + "reconciliation <- open_per_year %>%\n", + " left_join(reported_per_year, by = \"Annee\") %>%\n", + " mutate(\n", + " Ayant_rapporte_routine = tidyr::replace_na(Ayant_rapporte_routine, 0L),\n", + " `Pct_rapportant_(%)` = dplyr::if_else(\n", + " Ouvertes_pyramide > 0,\n", + " round(100 * Ayant_rapporte_routine / Ouvertes_pyramide, 1),\n", + " NA_real_\n", + " )\n", + " ) %>%\n", + " arrange(Annee)\n", + "\n", + "# Updated text (no \"six dernières 
années\")\n", + "cat(glue(\n", + " \"L’activité structurelle des formations sanitaires est évaluée via les dates d’ouverture/fermeture de la pyramide. \",\n", + " \"Une formation est considérée ouverte pour une année si elle a été inaugurée avant/pendant cette année \",\n", + " \"et non fermée avant le 31/12. Le tableau présente, pour chaque année disponible dans l’extraction routine, \",\n", + " \"le nombre de formations ouvertes et celles ayant rapporté au moins une valeur.\"\n", + "))\n", + "\n", + "kable(reconciliation,\n", + " caption = \"Ouverture (pyramide) vs. rapportage effectif (routine), par année\")\n" + ], + "execution_count": null, + "outputs": [], + "id": "03179c36-7870-4a37-a075-44420d01a9c4" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# --- Make sure VALUE is treated as numeric where possible (silently)\n", + "routine_data <- routine_data %>%\n", + " mutate(VALUE = suppressWarnings(as.numeric(VALUE)))\n", + "\n", + "# ----- Build the fixed universes from routine_data only -----\n", + "# A) Universe over the whole period (ever reported anything)\n", + "active_ou_all <- routine_data %>%\n", + " group_by(OU) %>%\n", + " summarise(active_ever = any(!is.na(VALUE)), .groups = \"drop\") %>%\n", + " filter(active_ever) %>%\n", + " pull(OU)\n", + "\n", + "denom_all <- length(active_ou_all)\n", + "\n", + "# B) Universe per year (reported at least once within that year)\n", + "per_ou_pe <- routine_data %>%\n", + " group_by(OU, PE) %>%\n", + " summarise(any_value = any(!is.na(VALUE)), .groups = \"drop\") %>%\n", + " mutate(year = substr(PE, 1, 4))\n", + "\n", + "active_by_year <- per_ou_pe %>%\n", + " group_by(year, OU) %>%\n", + " summarise(active_year = any(any_value), .groups = \"drop\") %>%\n", + " filter(active_year) %>%\n", + " group_by(year) %>%\n", + " summarise(denom_year = n_distinct(OU), .groups = \"drop\")\n", + "\n", + "# ----- Monthly reporting using fixed universes -----\n", + "# A) 
Denominator = active over the whole period\n", + "monthly_reporting_all <- per_ou_pe %>%\n", + " filter(OU %in% active_ou_all) %>%\n", + " group_by(PE) %>%\n", + " summarise(\n", + " n_reporting = sum(any_value),\n", + " denom = denom_all,\n", + " pct_reporting = 100 * n_reporting / denom,\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " arrange(PE)\n", + "\n", + "# B) Denominator = active within the year\n", + "monthly_reporting_by_year <- per_ou_pe %>%\n", + " group_by(year, PE) %>%\n", + " summarise(n_reporting = sum(any_value), .groups = \"drop\") %>%\n", + " left_join(active_by_year, by = \"year\") %>%\n", + " mutate(pct_reporting = 100 * n_reporting / denom_year) %>%\n", + " arrange(PE) %>%\n", + " group_by(year) %>%\n", + " mutate(denom_line = first(denom_year)) %>%\n", + " ungroup() %>%\n", + " mutate(PE = factor(PE, levels = sort(unique(PE)))) # keep month order" + ], + "execution_count": null, + "outputs": [], + "id": "3c413a37-7edd-48b1-981c-c7e30661dee7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "monthly_reporting_by_year <- monthly_reporting_by_year %>%\n", + " dplyr::group_by(year) %>%\n", + " dplyr::mutate(denom_line = dplyr::first(denom_year)) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::mutate(PE = factor(PE, levels = sort(unique(PE)))) # keep month order\n", + "\n", + "# Avoid geom_line warning when a facet has only one month\n", + "line_ready <- monthly_reporting_by_year %>%\n", + " dplyr::add_count(year, name = \"n_months_year\") %>%\n", + " dplyr::filter(n_months_year > 1)\n", + "\n", + "options(repr.plot.width = 13, repr.plot.height = 8)\n", + "ggplot(monthly_reporting_by_year, aes(x = PE)) +\n", + " geom_line(\n", + " data = line_ready,\n", + " aes(y = n_reporting, color = \"Formations rapportant\", group = 1),\n", + " linewidth = 1\n", + " ) +\n", + " geom_point(aes(y = n_reporting, color = \"Formations rapportant\"), size = 1.2) +\n", + " geom_line(\n", + " data = 
line_ready,\n", + " aes(y = denom_line, color = \"Total actif dans l'année\", group = 1),\n", + " linewidth = 1,\n", + " linetype = \"dashed\"\n", + " ) +\n", + " facet_wrap(~ year, scales = \"free_x\") +\n", + " scale_color_manual(values = c(\n", + " \"Formations rapportant\" = \"steelblue\",\n", + " \"Total actif dans l'année\" = \"grey40\"\n", + " )) +\n", + " labs(\n", + " title = \"Évolution du nombre de formations sanitaires rapportant des données\",\n", + " subtitle = \"Ligne pointillée : total des formations sanitaires qui ont déclaré au moins une fois un élément de donnée au cours de l'année\",\n", + " x = NULL, y = \"Nombre de formations sanitaires\", color = NULL\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(axis.text.x = element_text(angle = 45, hjust = 1))\n" + ], + "execution_count": null, + "outputs": [], + "id": "0f724464-b943-44a0-83e4-b500cf53d9db" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "options(repr.plot.width = 13, repr.plot.height = 8)\n", + "ggplot(monthly_reporting_by_year, aes(x = PE, y = pct_reporting)) +\n", + " geom_col(fill = \"darkgreen\", alpha = 0.8) +\n", + " facet_wrap(~ year, scales = \"free_x\") +\n", + " labs(\n", + " title = \"Proportion de formations sanitaires ayant rapporté au moins une valeur\",\n", + " subtitle = \"Par mois, avec dénominateur fixé à l'année de référence\",\n", + " x = NULL,\n", + " y = \"% des formations sanitaires\"\n", + " ) +\n", + " scale_y_continuous(limits = c(0, 100)) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 45, hjust = 1),\n", + " panel.grid.minor = element_blank()\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "343b759e-ae3d-4201-a602-a3869acc4489" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (config_json$SNT_CONFIG$COUNTRY_CODE == \"NER\") {\n", + "\n", + " # --- 
Classify and count ---\n", + " fosa_counts <- pyramid_data %>%\n", + " mutate(fosa_type = norm_fosa_type(LEVEL_6_NAME)) %>%\n", + " count(fosa_type, sort = TRUE)\n", + "\n", + " # --- Add total row ---\n", + " fosa_counts <- fosa_counts %>%\n", + " add_row(fosa_type = \"Total\", n = sum(fosa_counts$n))\n", + "\n", + " total_l6 <- sum(fosa_counts$n[fosa_counts$fosa_type != \"Total\"])\n", + "\n", + " # --- Display summary table ---\n", + " knitr::kable(fosa_counts, caption = \"Répartition des formations sanitaires par type (niveau 6)\")\n", + "\n", + "} else {\n", + " cat(\"Cette section n'est pas applicable : la structure pyramidale diffère pour ce pays.\")\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "c0324990-c5c5-461a-ba79-6695717e0bce" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Complétude de l'extraction des données de routine au niveau des formations sanitaires\n", + "\n", + "Cette section présente la distribution des valeurs extraites pour chaque élément de donnée du SNIS, mois par mois, au niveau des formations sanitaires incluses dans la pyramide sanitaire.\n", + "\n", + "Pour chaque élément, trois situations sont distinguées :\n", + "- Valeur positive rapportée : au moins une valeur supérieure à zéro a été déclarée\n", + "- Valeur zéro rapportée : uniquement des valeurs égales à zéro ont été enregistrées\n", + "- Valeur manquante : aucune donnée n’a été rapportée pour le mois considéré\n", + "\n", + "Le nombre total de formations sanitaires reste constant, correspondant à celles ayant transmis au moins une donnée sur la période d’analyse.\n", + "\n", + "Les graphiques ci-dessous illustrent, pour chaque indicateur SNIS, la proportion relative de ces trois types de valeurs au fil du temps, permettant d’évaluer la complétude et la cohérence des données extraites avant tout traitement analytique." 
+ ], + "id": "1be4d840-74a8-4a61-bf1a-af556ef270bc" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "options(jupyter.plot_mimetypes = c(\"image/png\"))" + ], + "execution_count": null, + "outputs": [], + "id": "3dec0947-f19a-40fe-8927-1ea0efdca904" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# --- 🚨 (NEW) STEP 1: *GP* sum up VALUEs of each INDICATOR (DX_NAME) by CO!! 🚨\n", + "routine_data <- routine_data %>%\n", + " group_by(OU, PE, DX_NAME) |> # DX_NAME == INDICATOR\n", + " summarise(VALUE = sum(as.numeric(VALUE)),\n", + " .groups = \"drop\") |>\n", + "mutate(INDICATOR = DX_NAME)" + ], + "execution_count": null, + "outputs": [], + "id": "b50e66f0-203c-44c4-9511-2532ea145980" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# --- STEP 2: Build expected full grid (OU × INDICATOR × DATE)\n", + "full_grid <- expand_grid(\n", + " OU = unique(routine_data$OU),\n", + " INDICATOR = unique(routine_data$INDICATOR),\n", + " PE = unique(routine_data$PE)\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "61d23bf9-14d1-4248-8232-bba1b95d837a" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# --- STEP 3: Join to detect missing / zero / positive\n", + "reporting_check <- full_grid %>%\n", + " left_join(\n", + " # data %>% select(OU, INDICATOR, DATE, VALUE),\n", + " routine_data %>% select(OU, INDICATOR, PE, VALUE),\n", + " # by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", + " by = c(\"OU\", \"INDICATOR\", \"PE\")\n", + " ) %>%\n", + " mutate(\n", + " is_missing = is.na(VALUE),\n", + " is_zero = VALUE == 0 & !is.na(VALUE),\n", + " is_positive = VALUE > 0 & !is.na(VALUE)\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "fa0504b6-baf3-41d0-85b9-a8ffe6a88c11" + }, + { + "cell_type": "code", + "metadata": { + 
"vscode": { + "languageId": "r" + } + }, + "source": [ + "# --- STEP 4: Summarise by INDICATOR and date\n", + "reporting_summary <- reporting_check %>%\n", + " # group_by(INDICATOR, DATE) %>%\n", + " group_by(INDICATOR, PE) %>%\n", + " summarise(\n", + " n_total = n_distinct(OU),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " # pct_total = sum(pct_missing, pct_zero, pct_positive), # sanity check: should be always == 100\n", + " .groups = \"drop\"\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "12bf998f-79da-4aa7-967e-ee50acaab000" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# --- STEP 5: Reshape for stacked plot\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(\n", + " cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\", values_to = \"Percentage\"\n", + " ) %>%\n", + " mutate(\n", + " Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur 0 rapportée\", # old: \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")\n", + " ) %>%\n", + " # complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n", + " complete(INDICATOR, PE, Status, fill = list(Percentage = 0))" + ], + "execution_count": null, + "outputs": [], + "id": "0504ad04-910e-4ec7-9448-397228baf541" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "plot_data <- plot_data %>%\n", + " left_join(classified_elements %>% distinct(DX_NAME, Categorie),\n", + " by = c(\"INDICATOR\" = \"DX_NAME\"),\n", + " relationship = \"many-to-many\")" + ], + "execution_count": null, + "outputs": [], + "id": 
"fcd8e259-80ff-4615-a568-9fe63ad0a9f3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "categories <- plot_data %>%\n", + " filter(!is.na(Categorie)) %>%\n", + " distinct(Categorie) %>%\n", + " pull(Categorie)\n", + "\n", + "plots_by_category <- map(categories, function(cat) {\n", + " df_cat <- plot_data %>% filter(Categorie == cat)\n", + "\n", + " ggplot(df_cat,\n", + " aes(x = PE, y = Percentage, fill = Status)) +\n", + " geom_col(position = \"stack\") +\n", + " geom_hline(yintercept = c(25, 50, 75), color = \"white\", linewidth = 0.25) +\n", + " facet_wrap(~ INDICATOR, scales = \"free_y\", nrow = 1) +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur 0 rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", + " title = paste(\"Distribution des valeurs extraites - Indicateur :\", cat),\n", + " subtitle = \"Proportion de formations sanitaires ayant rapporté des valeurs manquantes, nulles ou positives par mois\",\n", + " x = NULL,\n", + " y = \"% des formations sanitaires\",\n", + " fill = \"Type de valeur extraite\"\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 16),\n", + " strip.text = element_text(size = 10),\n", + " axis.title = element_text(size = 14),\n", + " axis.text = element_text(size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)\n", + " )\n", + "})\n" + ], + "execution_count": null, + "outputs": [], + "id": "8af82e0d-17cf-403e-a2f3-d76a90e915ea" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Example: show the first category plot\n", + "options(repr.plot.width = 15, repr.plot.height = 5)\n", + "walk(plots_by_category, print)" + ], + "execution_count": null, + "outputs": [], + "id": "c4b1db68-6ffe-4e6e-be81-30e615863fbf" + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. Disponibilité des données par formation sanitaire (sur la période analysée)" + ], + "id": "727c614b-c1fd-4c17-8414-1f6478f664c3" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cette section évalue la disponibilité des données de routine pour chaque formation sanitaire sur l’ensemble de la période analysée.\n", + "\n", + "- Pour chaque indicateur, le graphique montre le pourcentage de mois avec au moins une valeur non manquante (c’est-à-dire, une donnée rapportée, qu’elle soit nulle ou positive).\n", + "- Chaque ligne correspond à une formation sanitaire, et chaque colonne à un indicateur.\n", + "- Les couleurs vont du jaune (100 %), indiquant une disponibilité complète, au violet (0 %), indiquant une absence totale de données sur la période.\n", + "\n", + "Ce diagnostic permet d’identifier les formations sanitaires avec des problèmes chroniques de rapportage ou des interruptions prolongées dans la saisie des données." 
+ ], + "id": "dbb43faa-444e-42ec-902f-34a44e3fa355" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# How many distinct months are in the analysis window?\n", + "n_months <- dplyr::n_distinct(routine_data$PE)\n", + "\n", + "# --- 1) Coverage by facility x indicator -------------------------------------\n", + "# Count the number of months with any non-missing VALUE (dedup PE if needed)\n", + "facility_cov <- routine_data %>%\n", + " dplyr::group_by(OU, DX_NAME, PE) %>%\n", + " dplyr::summarise(has_value = any(!is.na(VALUE)), .groups = \"drop\") %>% # 1 row per OU × DX × PE\n", + " dplyr::group_by(OU, DX_NAME) %>%\n", + " dplyr::summarise(\n", + " months_reported = sum(has_value), # months with data\n", + " pct_reported = 100 * months_reported / n_months,\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Optional: order facilities by overall completeness (across all indicators)\n", + "ou_order <- facility_cov %>%\n", + " dplyr::group_by(OU) %>%\n", + " dplyr::summarise(pct_overall = mean(pct_reported, na.rm = TRUE), .groups = \"drop\") %>%\n", + " dplyr::arrange(dplyr::desc(pct_overall)) %>%\n", + " dplyr::pull(OU)\n", + "\n", + "# Optional: order indicators (e.g., alphabetical, or use your custom order)\n", + "ind_order <- facility_cov %>%\n", + " dplyr::distinct(DX_NAME) %>%\n", + " dplyr::arrange(DX_NAME) %>%\n", + " dplyr::pull(DX_NAME)\n", + "\n", + "plot_df <- facility_cov %>%\n", + " dplyr::mutate(\n", + " OU = factor(OU, levels = ou_order),\n", + " DX_NAME = factor(DX_NAME, levels = ind_order)\n", + " )\n", + "\n", + "# --- 2) Heatmap ---------------------------------------------------------------\n", + "# Make the figure wide and tall so it remains readable\n", + "options(repr.plot.width = 15, repr.plot.height = 9)\n", + "\n", + "ggplot(plot_df, aes(x = OU, y = DX_NAME, fill = pct_reported)) +\n", + " geom_tile() +\n", + " scale_fill_viridis_c(name = \"% rapporté\", limits = c(0, 100)) +\n", + " 
labs(\n", + " title = \"Disponibilité des données par formation sanitaire (sur la période analysée)\",\n", + " subtitle = paste0(\"Pour chaque élément, % de mois avec une valeur non manquante • Fenêtre: \",\n", + " n_months, \" mois\"),\n", + " x = \"Formation sanitaire\",\n", + " y = \"Élément de données\"\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " axis.text.x = element_blank(), # trop nombreux\n", + " axis.ticks.x = element_blank(),\n", + " axis.text.y = element_text(size = 11),\n", + " plot.title = element_text(face = \"bold\", size = 16),\n", + " panel.grid = element_blank(),\n", + " legend.position = \"right\"\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "071be12f-8967-4580-bc2d-43dd07c1f855" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. Tendances nationales et mensuelles par élément de données" + ], + "id": "7a5ed92b-f048-4467-8d3a-f86c6a4fe141" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cette section présente l’évolution temporelle des valeurs mensuelles totales pour chaque indicateur de paludisme au cours de la période analysée. Les courbes montrent la somme des valeurs rapportées à travers toutes les formations sanitaires et toutes les désagrégations.\n", + "- Chaque graphique correspond à un indicateur agrégé (par exemple, cas confirmés, cas présumés, décès, etc.).\n", + "- L’axe horizontal représente le temps (mois), et l’axe vertical le total des valeurs rapportées pour l’ensemble du pays.\n", + "\n", + "Ces tendances permettent de visualiser les fluctuations saisonnières et d’identifier d’éventuelles anomalies ou ruptures dans la dynamique des cas rapportés." 
+ ], + "id": "6371c0e1-a5a7-423e-9df3-fc9eb8214bb6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# routine data\n", + "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })" + ], + "execution_count": null, + "outputs": [], + "id": "e56014d5-de3f-45a1-8a4f-a05f4ecc78d8" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Outlier removal removed: keep raw values as reported\n", + "routine_data <- routine_data %>%\n", + " mutate(VALUE = suppressWarnings(as.numeric(VALUE))) # ensure numeric\n" + ], + "execution_count": null, + "outputs": [], + "id": "582f4c5e-b650-4422-bc94-873fd1f0d399" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "### 1. Build mapping table (Categorie, DX, optional CO)\n", + "indicator_map <- imap_dfr(indicator_defs, function(dx_list, categorie) {\n", + " tibble(raw = dx_list) %>%\n", + " mutate(\n", + " DX = str_extract(raw, \"^[^\\\\.]+\"), # before first \".\"\n", + " CO = str_extract(raw, \"(?<=\\\\.).+\"), # after first \".\", if present\n", + " Categorie = categorie\n", + " ) %>%\n", + " select(Categorie, DX, CO)\n", + "})\n", + "\n", + "### 2. Clean routine data (fresh, with DX + CO columns present)\n", + "rd_clean <- routine_data %>%\n", + " mutate(\n", + " VALUE = suppressWarnings(as.numeric(VALUE)),\n", + " month = ymd(paste0(PE, \"01\"))\n", + " ) %>%\n", + " filter(!is.na(month), !is.na(DX))\n", + "\n", + "### 3. 
Split mapping into “DX only” vs “DX+CO”, then join\n", + "\n", + "# (a) Categories where CO is NOT specified: include all COs for that DX\n", + "map_dx_only <- indicator_map %>%\n", + " filter(is.na(CO)) %>%\n", + " distinct(Categorie, DX)\n", + "\n", + "joined_dx_only <- rd_clean %>%\n", + " inner_join(\n", + " map_dx_only,\n", + " by = \"DX\",\n", + " relationship = \"many-to-many\" # <-- silence warning, intentional\n", + " )\n", + "\n", + "# (b) Categories where CO IS specified: restrict to that exact DX+CO pair\n", + "map_dx_co <- indicator_map %>%\n", + " filter(!is.na(CO)) %>%\n", + " distinct(Categorie, DX, CO)\n", + "\n", + "joined_dx_co <- rd_clean %>%\n", + " inner_join(\n", + " map_dx_co,\n", + " by = c(\"DX\", \"CO\"),\n", + " relationship = \"many-to-many\" # <-- silence warning, intentional\n", + " )\n", + "\n", + "# (c) Combine\n", + "rd_cat <- bind_rows(joined_dx_only, joined_dx_co)\n", + "\n", + "# Ensure DX_NAME exists and create wrapped labels for long legend items\n", + "if (!\"DX_NAME\" %in% names(rd_cat)) {\n", + " rd_cat <- rd_cat %>%\n", + " left_join(\n", + " routine_data %>% distinct(DX, DX_NAME),\n", + " by = \"DX\"\n", + " )\n", + "}\n", + "\n", + "rd_cat <- rd_cat %>%\n", + " mutate(\n", + " DX_NAME = coalesce(DX_NAME, DX),\n", + " DX_LABEL = stringr::str_wrap(DX_NAME, width = 40)\n", + " )\n", + "\n", + "### 4. Monthly totals per category and per data element (DX_NAME)\n", + "monthly_by_dx <- rd_cat %>%\n", + " group_by(Categorie, DX_NAME, DX_LABEL, month) %>%\n", + " summarise(\n", + " total = sum(VALUE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "### 5. 
Ensure full monthly sequence per (Categorie, DX_NAME)\n", + "monthly_by_dx_complete <- monthly_by_dx %>%\n", + " group_by(Categorie, DX_NAME, DX_LABEL) %>%\n", + " complete(\n", + " month = seq(min(month, na.rm = TRUE),\n", + " max(month, na.rm = TRUE),\n", + " by = \"1 month\"),\n", + " fill = list(total = 0)\n", + " ) %>%\n", + " ungroup()\n", + "\n", + "### 6. Build one plot per category\n", + "min_month <- min(ymd(paste0(routine_data$PE, \"01\")), na.rm = TRUE)\n", + "max_month <- max(ymd(paste0(routine_data$PE, \"01\")), na.rm = TRUE)\n", + "\n", + "plots_by_cat <- monthly_by_dx_complete %>%\n", + " split(.$Categorie) %>%\n", + " imap(function(df_cat, cat_name) {\n", + " has_multiple_months <- dplyr::n_distinct(df_cat$month) > 1\n", + "\n", + " p <- ggplot(\n", + " df_cat,\n", + " aes(x = month, y = total, color = DX_LABEL, group = DX_NAME)\n", + " )\n", + "\n", + " if (has_multiple_months) {\n", + " p <- p + geom_line(linewidth = 1.0, alpha = 0.9)\n", + " }\n", + "\n", + " p +\n", + " geom_point(size = 2.0, alpha = 0.95) +\n", + " \n", + " # Fixed x-axis for all categories\n", + " scale_x_date(\n", + " limits = c(min_month, max_month),\n", + " date_breaks = \"1 year\",\n", + " date_labels = \"%Y\"\n", + " ) +\n", + " \n", + " # Y-axis with safe formatting (avoid scale_cut bug)\n", + " scale_y_continuous(\n", + " labels = function(x) {\n", + " ifelse(x >= 1e6, paste0(round(x/1e6, 1), \"M\"),\n", + " ifelse(x >= 1e3, paste0(round(x/1e3, 1), \"K\"),\n", + " as.character(round(x))))\n", + " }\n", + " ) +\n", + " \n", + " labs(\n", + " title = paste0(cat_name, \" - series mensuelles par element de donnee\"),\n", + " subtitle = \"Somme des valeurs mensuelles (toutes UO et desagregations confondues)\",\n", + " x = \"Temps\",\n", + " y = \"Total national rapporte\",\n", + " color = \"Element de donnee\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 16),\n", + " plot.subtitle = 
element_text(size = 11, margin = margin(b = 8)),\n", + " axis.title.x = element_text(size = 12, face = \"bold\", margin = margin(t = 8)),\n", + " axis.title.y = element_text(size = 12, face = \"bold\", margin = margin(r = 8)),\n", + " axis.text.x = element_text(size = 10, angle = 45, hjust = 1, vjust = 1),\n", + " axis.text.y = element_text(size = 10),\n", + " legend.position = \"right\",\n", + " legend.title = element_text(size = 12, face = \"bold\"),\n", + " legend.text = element_text(size = 10),\n", + " legend.key.width = unit(1.2, \"lines\"),\n", + " legend.key.height = unit(0.95, \"lines\"),\n", + " panel.grid.minor = element_blank()\n", + " ) +\n", + " guides(color = guide_legend(ncol = 1, byrow = TRUE, override.aes = list(size = 3, alpha = 1)))\n", + " })" + ], + "execution_count": null, + "outputs": [], + "id": "1cd7f87b-3ea0-4c32-869d-79f73c9fd016" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Export each category plot with dynamic canvas to keep legends readable\n", + "if (!dir.exists(FIGURES_PATH)) {\n", + " dir.create(FIGURES_PATH, recursive = TRUE)\n", + "}\n", + "\n", + "options(repr.plot.width = 16, repr.plot.height = 8)\n", + "\n", + "purrr::iwalk(\n", + " plots_by_cat,\n", + " ~{\n", + " category_name <- .y\n", + " safe_name <- gsub(\"[^A-Za-z0-9]+\", \"_\", category_name)\n", + " \n", + " # Increase height when category has many legend items\n", + " n_items <- monthly_by_dx_complete %>%\n", + " filter(Categorie == category_name) %>%\n", + " distinct(DX_LABEL) %>%\n", + " nrow()\n", + " \n", + " fig_height <- max(8, min(18, 5 + (0.35 * n_items)))\n", + " file_name <- paste0(COUNTRY_CODE, \"_extract_trend_\", safe_name, \".png\")\n", + " \n", + " ggsave(\n", + " filename = file_name,\n", + " path = FIGURES_PATH,\n", + " plot = .x,\n", + " width = 18,\n", + " height = fig_height,\n", + " units = \"in\",\n", + " dpi = 300,\n", + " bg = \"white\"\n", + " )\n", + " \n", + " print(.x)\n", + 
" log_msg(glue::glue(\"Plot exporte: {file.path(FIGURES_PATH, file_name)}\"))\n", + " }\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "4b6b396d-91a3-4839-849b-f720b4ca61b5" } - }, - "outputs": [], - "source": [ - "# Mois minimum et maximum dans le jeu de données\n", - "cat(\"Premier mois pour lequel les données ont été extraites :\", min(routine_data$PE), \"\\n\")\n", - "cat(\"Dernier mois pour lequel les données ont été extraites :\", max(routine_data$PE), \"\\n\")\n", - "cat(\"Nombre total de mois couverts par les données :\", length(unique(routine_data$PE)), \"\\n\")\n", - "\n", - "# Vérification des mois manquants (en supposant des données mensuelles entre min et max)\n", - "all_months <- seq(ymd(paste0(min(routine_data$PE), \"01\")),\n", - " ymd(paste0(max(routine_data$PE), \"01\")),\n", - " by = \"1 month\") %>%\n", - " format(\"%Y%m\")" - ] - }, - { - "cell_type": "markdown", - "id": "05f6938d-046b-4742-b8cb-3840a3646fb7", - "metadata": {}, - "source": [ - "### 3. 
Résumé hierarchique" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7533a147-6e04-4789-8f3a-e4687fb886b9", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Map NAME -> ID (robust if already *_ID)\n", - "adm1_id <- ifelse(str_ends(ADM_1, \"_ID\"), ADM_1, str_replace(ADM_1, \"_NAME$\", \"_ID\"))\n", - "adm2_id <- ifelse(str_ends(ADM_2, \"_ID\"), ADM_2, str_replace(ADM_2, \"_NAME$\", \"_ID\"))\n", - "\n", - "# Collect and order available LEVEL_*_ID columns\n", - "level_id_cols <- names(pyramid_data)[grepl(\"^LEVEL_\\\\d+_ID$\", names(pyramid_data))]\n", - "level_order <- as.integer(str_match(level_id_cols, \"^LEVEL_(\\\\d+)_ID$\")[,2])\n", - "level_id_cols <- level_id_cols[order(level_order)]\n", - "\n", - "# Build summary (counts of unique IDs per level)\n", - "level_summary <- tibble(Column = level_id_cols) %>%\n", - " mutate(\n", - " Level = as.integer(str_match(Column, \"^LEVEL_(\\\\d+)_ID$\")[,2]),\n", - " `Nombre d'unités` = map_int(Column, ~ n_distinct(pyramid_data[[.x]], na.rm = TRUE))\n", - " ) %>%\n", - " arrange(Level)\n", - "\n", - "# Add role labels using *_ID columns\n", - "level_summary <- level_summary %>%\n", - " mutate(\n", - " Rôle = case_when(\n", - " Column == adm1_id ~ \"ADM_1 (administration 1)\",\n", - " Column == adm2_id ~ \"ADM_2 (administration 2)\",\n", - " Level == facility_level ~ glue(\"Niveau des FOSA (L{facility_level})\"),\n", - " TRUE ~ \"\"\n", - " )\n", - " )\n", - "\n", - "# Pretty print\n", - "level_summary %>%\n", - " mutate(Niveau = paste0(\"L\", Level)) %>%\n", - " select(Niveau, Column, `Nombre d'unités`, Rôle) %>%\n", - " kable(caption = \"Résumé hiérarchique: nombre d’unités (IDs) uniques par niveau (pyramid_data)\")\n", - "\n", - "cat(glue(\n", - " \"\\nNote : ADM_1 est mappé sur `{ADM_1}` → `{adm1_id}`, ADM_2 sur `{ADM_2}` → `{adm2_id}`. 
\",\n", - " \"Le niveau opérationnel des formations sanitaires est L{facility_level}.\\n\"\n", - "))" - ] - }, - { - "cell_type": "markdown", - "id": "c413e780-7a1a-4241-a06b-274e77d41b50", - "metadata": {}, - "source": [ - "### 4. Nombre et activité des formations sanitaires" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "239d062c-143d-4a7c-ac93-bee280e1d57a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Nombre total de formations sanitaires uniques selon le niveau organisationnel défini dans la pyramide\n", - "total_facilities <- pyramid_data %>% \n", - " pull(!!sym(paste0(\"LEVEL_\", facility_level, \"_ID\"))) %>%\n", - " unique() %>% \n", - " length()\n", - "\n", - "cat(glue::glue(\n", - " \"Les établissements sont identifiés de manière unique par leur identifiant d’unité organisationnelle issu de la pyramide, \",\n", - " \"c’est-à-dire le niveau {facility_level} de la hiérarchie sanitaire. \",\n", - " \"Au total, {total_facilities} formations sanitaires uniques ont été identifiées à ce niveau.\"\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bc914d8-852d-4615-bcba-64bb2d33c56c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Vérification de l’activité : une formation sanitaire est considérée comme « active »\n", - "# si elle a rapporté au moins une valeur (y compris zéro) pendant la période spécifiée.\n", - "activity <- routine_data %>%\n", - " group_by(OU, PE) %>%\n", - " summarise(active = any(!is.na(VALUE)), .groups = \"drop\")\n", - "\n", - "# Nombre de formations sanitaires actives au moins une fois\n", - "active_facilities <- activity %>%\n", - " group_by(OU) %>%\n", - " summarise(active_ever = any(active), .groups = \"drop\") %>%\n", - " filter(active_ever) %>%\n", - " nrow()\n", - "\n", - "# Proportion d’établissements actifs\n", - "proportion_active <- 100 * active_facilities / 
total_facilities\n", - "\n", - "# Résumé des résultats (version enrichie)\n", - "period_start <- min(routine_data$PE)\n", - "period_end <- max(routine_data$PE)\n", - "\n", - "cat(glue(\n", - " \"Sur un total de {total_facilities} formations sanitaires uniques identifiées dans la pyramide, \",\n", - " \"{active_facilities} ont rapporté au moins une donnée sur un élément au cours de la période spécifiée \",\n", - " \"dans les données de routine ({period_start}–{period_end}), \",\n", - " \"soit {round(proportion_active, 1)} % d’établissements ayant effectivement transmis des données.\"\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03179c36-7870-4a37-a075-44420d01a9c4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Years from routine (already fine)\n", - "yrs_rout <- sort(unique(as.integer(substr(routine_data$PE, 1, 4))))\n", - "years <- seq(min(yrs_rout, na.rm = TRUE), max(yrs_rout, na.rm = TRUE), by = 1)\n", - "\n", - "# Helper: ensure Annee is integer\n", - "open_in_year <- function(df, y) {\n", - " y <- as.integer(y)\n", - " year_start <- as.Date(sprintf(\"%s-01-01\", y))\n", - " year_end <- as.Date(sprintf(\"%s-12-31\", y))\n", - " df %>%\n", - " filter(\n", - " as.Date(OPENING_DATE) <= year_end,\n", - " is.na(CLOSED_DATE) | as.Date(CLOSED_DATE) >= year_start\n", - " ) %>%\n", - " summarise(Annee = y, Ouvertes_pyramide = n(), .groups = \"drop\")\n", - "}\n", - "\n", - "open_per_year <- bind_rows(lapply(years, open_in_year, df = pyramid_data)) %>%\n", - " mutate(Annee = as.integer(Annee))\n", - "\n", - "reported_per_year <- routine_data %>%\n", - " mutate(Annee = as.integer(substr(PE, 1, 4))) %>%\n", - " filter(Annee %in% years, !is.na(OU)) %>%\n", - " group_by(Annee, OU) %>%\n", - " summarise(any_value = any(!is.na(VALUE)), .groups = \"drop\") %>%\n", - " group_by(Annee) %>%\n", - " summarise(Ayant_rapporte_routine = sum(any_value, na.rm = TRUE), .groups = \"drop\")\n", - "\n", - 
"reconciliation <- open_per_year %>%\n", - " left_join(reported_per_year, by = \"Annee\") %>%\n", - " mutate(\n", - " Ayant_rapporte_routine = tidyr::replace_na(Ayant_rapporte_routine, 0L),\n", - " `Pct_rapportant_(%)` = dplyr::if_else(\n", - " Ouvertes_pyramide > 0,\n", - " round(100 * Ayant_rapporte_routine / Ouvertes_pyramide, 1),\n", - " NA_real_\n", - " )\n", - " ) %>%\n", - " arrange(Annee)\n", - "\n", - "# Updated text (no \"six dernières années\")\n", - "cat(glue(\n", - " \"L’activité structurelle des formations sanitaires est évaluée via les dates d’ouverture/fermeture de la pyramide. \",\n", - " \"Une formation est considérée ouverte pour une année si elle a été inaugurée avant/pendant cette année \",\n", - " \"et non fermée avant le 31/12. Le tableau présente, pour chaque année disponible dans l’extraction routine, \",\n", - " \"le nombre de formations ouvertes et celles ayant rapporté au moins une valeur.\"\n", - "))\n", - "\n", - "kable(reconciliation,\n", - " caption = \"Ouverture (pyramide) vs. 
rapportage effectif (routine), par année\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c413a37-7edd-48b1-981c-c7e30661dee7", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- Make sure VALUE is treated as numeric where possible (silently)\n", - "routine_data <- routine_data %>%\n", - " mutate(VALUE = suppressWarnings(as.numeric(VALUE)))\n", - "\n", - "# ----- Build the fixed universes from routine_data only -----\n", - "# A) Universe over the whole period (ever reported anything)\n", - "active_ou_all <- routine_data %>%\n", - " group_by(OU) %>%\n", - " summarise(active_ever = any(!is.na(VALUE)), .groups = \"drop\") %>%\n", - " filter(active_ever) %>%\n", - " pull(OU)\n", - "\n", - "denom_all <- length(active_ou_all)\n", - "\n", - "# B) Universe per year (reported at least once within that year)\n", - "per_ou_pe <- routine_data %>%\n", - " group_by(OU, PE) %>%\n", - " summarise(any_value = any(!is.na(VALUE)), .groups = \"drop\") %>%\n", - " mutate(year = substr(PE, 1, 4))\n", - "\n", - "active_by_year <- per_ou_pe %>%\n", - " group_by(year, OU) %>%\n", - " summarise(active_year = any(any_value), .groups = \"drop\") %>%\n", - " filter(active_year) %>%\n", - " group_by(year) %>%\n", - " summarise(denom_year = n_distinct(OU), .groups = \"drop\")\n", - "\n", - "# ----- Monthly reporting using fixed universes -----\n", - "# A) Denominator = active over the whole period\n", - "monthly_reporting_all <- per_ou_pe %>%\n", - " filter(OU %in% active_ou_all) %>%\n", - " group_by(PE) %>%\n", - " summarise(\n", - " n_reporting = sum(any_value),\n", - " denom = denom_all,\n", - " pct_reporting = 100 * n_reporting / denom,\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " arrange(PE)\n", - "\n", - "# B) Denominator = active within the year\n", - "monthly_reporting_by_year <- per_ou_pe %>%\n", - " group_by(year, PE) %>%\n", - " summarise(n_reporting = sum(any_value), .groups = \"drop\") %>%\n", - " 
left_join(active_by_year, by = \"year\") %>%\n", - " mutate(pct_reporting = 100 * n_reporting / denom_year) %>%\n", - " arrange(PE) %>%\n", - " group_by(year) %>%\n", - " mutate(denom_line = first(denom_year)) %>%\n", - " ungroup() %>%\n", - " mutate(PE = factor(PE, levels = sort(unique(PE)))) # keep month order" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f724464-b943-44a0-83e4-b500cf53d9db", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "monthly_reporting_by_year <- monthly_reporting_by_year %>%\n", - " dplyr::group_by(year) %>%\n", - " dplyr::mutate(denom_line = dplyr::first(denom_year)) %>%\n", - " dplyr::ungroup() %>%\n", - " dplyr::mutate(PE = factor(PE, levels = sort(unique(PE)))) # keep month order\n", - "\n", - "# Avoid geom_line warning when a facet has only one month\n", - "line_ready <- monthly_reporting_by_year %>%\n", - " dplyr::add_count(year, name = \"n_months_year\") %>%\n", - " dplyr::filter(n_months_year > 1)\n", - "\n", - "options(repr.plot.width = 13, repr.plot.height = 8)\n", - "ggplot(monthly_reporting_by_year, aes(x = PE)) +\n", - " geom_line(\n", - " data = line_ready,\n", - " aes(y = n_reporting, color = \"Formations rapportant\", group = 1),\n", - " linewidth = 1\n", - " ) +\n", - " geom_point(aes(y = n_reporting, color = \"Formations rapportant\"), size = 1.2) +\n", - " geom_line(\n", - " data = line_ready,\n", - " aes(y = denom_line, color = \"Total actif dans l'année\", group = 1),\n", - " linewidth = 1,\n", - " linetype = \"dashed\"\n", - " ) +\n", - " facet_wrap(~ year, scales = \"free_x\") +\n", - " scale_color_manual(values = c(\n", - " \"Formations rapportant\" = \"steelblue\",\n", - " \"Total actif dans l'année\" = \"grey40\"\n", - " )) +\n", - " labs(\n", - " title = \"Évolution du nombre de formations sanitaires rapportant des données\",\n", - " subtitle = \"Ligne pointillée : total des formations sanitaires qui ont déclaré au moins une fois un élément 
de donnée au cours de l'année\",\n", - " x = NULL, y = \"Nombre de formations sanitaires\", color = NULL\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(axis.text.x = element_text(angle = 45, hjust = 1))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343b759e-ae3d-4201-a602-a3869acc4489", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "options(repr.plot.width = 13, repr.plot.height = 8)\n", - "ggplot(monthly_reporting_by_year, aes(x = PE, y = pct_reporting)) +\n", - " geom_col(fill = \"darkgreen\", alpha = 0.8) +\n", - " facet_wrap(~ year, scales = \"free_x\") +\n", - " labs(\n", - " title = \"Proportion de formations sanitaires ayant rapporté au moins une valeur\",\n", - " subtitle = \"Par mois, avec dénominateur fixé à l'année de référence\",\n", - " x = NULL,\n", - " y = \"% des formations sanitaires\"\n", - " ) +\n", - " scale_y_continuous(limits = c(0, 100)) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 45, hjust = 1),\n", - " panel.grid.minor = element_blank()\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0324990-c5c5-461a-ba79-6695717e0bce", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (config_json$SNT_CONFIG$COUNTRY_CODE == \"NER\") {\n", - "\n", - " # --- Helper to classify facility types (Niger-specific) ---\n", - " norm_fosa_type <- function(x){\n", - " x_up <- str_to_upper(str_squish(x))\n", - " case_when(\n", - " str_detect(x_up, \"^HD\\\\b\") ~ \"HD (hôpital de district)\",\n", - " str_detect(x_up, \"^CSI\\\\b\") ~ \"CSI (centre de santé intégré)\",\n", - " str_detect(x_up, \"^CS\\\\b\") ~ \"CS (case de santé)\",\n", - " str_detect(x_up, \"^(SS\\\\b|SALLE\\\\b|SALLE D'ACCOUCHEMENT\\\\b)\") ~ \"SS / Salle (soins/maternité)\",\n", - " str_detect(x_up, \"^(CLINIQUE|POLYCLINIQUE)\\\\b\") ~ \"Clinique (privé)\",\n", - " 
str_detect(x_up, \"^CABINET\\\\b\") ~ \"Cabinet (privé)\",\n", - " str_detect(x_up, \"^(INFIRMERIE|INFIRM)\\\\b\") ~ \"Infirmerie (privé)\",\n", - " str_detect(x_up, \"^CNSS\\\\b\") ~ \"CNSS\",\n", - " TRUE ~ \"Autre\"\n", - " )\n", - " }\n", - "\n", - " # --- Classify and count ---\n", - " fosa_counts <- pyramid_data %>%\n", - " mutate(fosa_type = norm_fosa_type(LEVEL_6_NAME)) %>%\n", - " count(fosa_type, sort = TRUE)\n", - "\n", - " # --- Add total row ---\n", - " fosa_counts <- fosa_counts %>%\n", - " add_row(fosa_type = \"Total\", n = sum(fosa_counts$n))\n", - "\n", - " total_l6 <- sum(fosa_counts$n[fosa_counts$fosa_type != \"Total\"])\n", - "\n", - " # --- Display summary table ---\n", - " knitr::kable(fosa_counts, caption = \"Répartition des formations sanitaires par type (niveau 6)\")\n", - "\n", - "} else {\n", - " cat(\"Cette section n'est pas applicable : la structure pyramidale diffère pour ce pays.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "1be4d840-74a8-4a61-bf1a-af556ef270bc", - "metadata": {}, - "source": [ - "### 5. 
Complétude de l'extraction des données de routine au niveau des formations sanitaires\n", - "\n", - "Cette section présente la distribution des valeurs extraites pour chaque élément de donnée du SNIS, mois par mois, au niveau des formations sanitaires incluses dans la pyramide sanitaire.\n", - "\n", - "Pour chaque élément, trois situations sont distinguées :\n", - "- Valeur positive rapportée : au moins une valeur supérieure à zéro a été déclarée\n", - "- Valeur zéro rapportée : uniquement des valeurs égales à zéro ont été enregistrées\n", - "- Valeur manquante : aucune donnée n’a été rapportée pour le mois considéré\n", - "\n", - "Le nombre total de formations sanitaires reste constant, correspondant à celles ayant transmis au moins une donnée sur la période d’analyse.\n", - "\n", - "Les graphiques ci-dessous illustrent, pour chaque indicateur SNIS, la proportion relative de ces trois types de valeurs au fil du temps, permettant d’évaluer la complétude et la cohérence des données extraites avant tout traitement analytique." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3dec0947-f19a-40fe-8927-1ea0efdca904", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "options(jupyter.plot_mimetypes = c(\"image/png\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b50e66f0-203c-44c4-9511-2532ea145980", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- 🚨 (NEW) STEP 1: *GP* sum up VALUEs of each INDICATOR (DX_NAME) by CO!! 
🚨\n", - "routine_data <- routine_data %>%\n", - " group_by(OU, PE, DX_NAME) |> # DX_NAME == INDICATOR\n", - " summarise(VALUE = sum(as.numeric(VALUE)),\n", - " .groups = \"drop\") |>\n", - "mutate(INDICATOR = DX_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61d23bf9-14d1-4248-8232-bba1b95d837a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- STEP 2: Build expected full grid (OU × INDICATOR × DATE)\n", - "full_grid <- expand_grid(\n", - " OU = unique(routine_data$OU),\n", - " INDICATOR = unique(routine_data$INDICATOR),\n", - " PE = unique(routine_data$PE)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa0504b6-baf3-41d0-85b9-a8ffe6a88c11", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- STEP 3: Join to detect missing / zero / positive\n", - "reporting_check <- full_grid %>%\n", - " left_join(\n", - " # data %>% select(OU, INDICATOR, DATE, VALUE),\n", - " routine_data %>% select(OU, INDICATOR, PE, VALUE),\n", - " # by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", - " by = c(\"OU\", \"INDICATOR\", \"PE\")\n", - " ) %>%\n", - " mutate(\n", - " is_missing = is.na(VALUE),\n", - " is_zero = VALUE == 0 & !is.na(VALUE),\n", - " is_positive = VALUE > 0 & !is.na(VALUE)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12bf998f-79da-4aa7-967e-ee50acaab000", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- STEP 4: Summarise by INDICATOR and date\n", - "reporting_summary <- reporting_check %>%\n", - " # group_by(INDICATOR, DATE) %>%\n", - " group_by(INDICATOR, PE) %>%\n", - " summarise(\n", - " n_total = n_distinct(OU),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 
100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " # pct_total = sum(pct_missing, pct_zero, pct_positive), # sanity check: should be always == 100\n", - " .groups = \"drop\"\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0504ad04-910e-4ec7-9448-397228baf541", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- STEP 5: Reshape for stacked plot\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\", values_to = \"Percentage\"\n", - " ) %>%\n", - " mutate(\n", - " Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur 0 rapportée\", # old: \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")\n", - " ) %>%\n", - " # complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n", - " complete(INDICATOR, PE, Status, fill = list(Percentage = 0))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fcd8e259-80ff-4615-a568-9fe63ad0a9f3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "plot_data <- plot_data %>%\n", - " left_join(classified_elements %>% distinct(DX_NAME, Categorie),\n", - " by = c(\"INDICATOR\" = \"DX_NAME\"),\n", - " relationship = \"many-to-many\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8af82e0d-17cf-403e-a2f3-d76a90e915ea", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "categories <- plot_data %>%\n", - " filter(!is.na(Categorie)) %>%\n", - " distinct(Categorie) %>%\n", - " pull(Categorie)\n", - "\n", - "plots_by_category <- map(categories, function(cat) {\n", - " df_cat <- plot_data %>% filter(Categorie == cat)\n", - "\n", - " ggplot(df_cat,\n", - " aes(x = PE, y = Percentage, fill = Status)) +\n", - " geom_col(position = 
\"stack\") +\n", - " geom_hline(yintercept = c(25, 50, 75), color = \"white\", linewidth = 0.25) +\n", - " facet_wrap(~ INDICATOR, scales = \"free_y\", nrow = 1) +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur 0 rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = paste(\"Distribution des valeurs extraites - Indicateur :\", cat),\n", - " subtitle = \"Proportion de formations sanitaires ayant rapporté des valeurs manquantes, nulles ou positives par mois\",\n", - " x = NULL,\n", - " y = \"% des formations sanitaires\",\n", - " fill = \"Type de valeur extraite\"\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 16),\n", - " strip.text = element_text(size = 10),\n", - " axis.title = element_text(size = 14),\n", - " axis.text = element_text(size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)\n", - " )\n", - "})\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4b1db68-6ffe-4e6e-be81-30e615863fbf", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Example: show the first category plot\n", - "options(repr.plot.width = 15, repr.plot.height = 5)\n", - "walk(plots_by_category, print)" - ] - }, - { - "cell_type": "markdown", - "id": "727c614b-c1fd-4c17-8414-1f6478f664c3", - "metadata": {}, - "source": [ - "### 6. 
Disponibilité des données par formation sanitaire (sur la période analysée)" - ] - }, - { - "cell_type": "markdown", - "id": "dbb43faa-444e-42ec-902f-34a44e3fa355", - "metadata": {}, - "source": [ - "Cette section évalue la disponibilité des données de routine pour chaque formation sanitaire sur l’ensemble de la période analysée.\n", - "\n", - "- Pour chaque indicateur, le graphique montre le pourcentage de mois avec au moins une valeur non manquante (c’est-à-dire, une donnée rapportée, qu’elle soit nulle ou positive).\n", - "- Chaque ligne correspond à une formation sanitaire, et chaque colonne à un indicateur.\n", - "- Les couleurs vont du jaune (100 %), indiquant une disponibilité complète, au violet (0 %), indiquant une absence totale de données sur la période.\n", - "\n", - "Ce diagnostic permet d’identifier les formations sanitaires avec des problèmes chroniques de rapportage ou des interruptions prolongées dans la saisie des données." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "071be12f-8967-4580-bc2d-43dd07c1f855", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# How many distinct months are in the analysis window?\n", - "n_months <- dplyr::n_distinct(routine_data$PE)\n", - "\n", - "# --- 1) Coverage by facility x indicator -------------------------------------\n", - "# Count the number of months with any non-missing VALUE (dedup PE if needed)\n", - "facility_cov <- routine_data %>%\n", - " dplyr::group_by(OU, DX_NAME, PE) %>%\n", - " dplyr::summarise(has_value = any(!is.na(VALUE)), .groups = \"drop\") %>% # 1 row per OU × DX × PE\n", - " dplyr::group_by(OU, DX_NAME) %>%\n", - " dplyr::summarise(\n", - " months_reported = sum(has_value), # months with data\n", - " pct_reported = 100 * months_reported / n_months,\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Optional: order facilities by overall completeness (across all indicators)\n", - "ou_order <- facility_cov %>%\n", - " 
dplyr::group_by(OU) %>%\n", - " dplyr::summarise(pct_overall = mean(pct_reported, na.rm = TRUE), .groups = \"drop\") %>%\n", - " dplyr::arrange(dplyr::desc(pct_overall)) %>%\n", - " dplyr::pull(OU)\n", - "\n", - "# Optional: order indicators (e.g., alphabetical, or use your custom order)\n", - "ind_order <- facility_cov %>%\n", - " dplyr::distinct(DX_NAME) %>%\n", - " dplyr::arrange(DX_NAME) %>%\n", - " dplyr::pull(DX_NAME)\n", - "\n", - "plot_df <- facility_cov %>%\n", - " dplyr::mutate(\n", - " OU = factor(OU, levels = ou_order),\n", - " DX_NAME = factor(DX_NAME, levels = ind_order)\n", - " )\n", - "\n", - "# --- 2) Heatmap ---------------------------------------------------------------\n", - "# Make the figure wide and tall so it remains readable\n", - "options(repr.plot.width = 15, repr.plot.height = 9)\n", - "\n", - "ggplot(plot_df, aes(x = OU, y = DX_NAME, fill = pct_reported)) +\n", - " geom_tile() +\n", - " scale_fill_viridis_c(name = \"% rapporté\", limits = c(0, 100)) +\n", - " labs(\n", - " title = \"Disponibilité des données par formation sanitaire (sur la période analysée)\",\n", - " subtitle = paste0(\"Pour chaque élément, % de mois avec une valeur non manquante • Fenêtre: \",\n", - " n_months, \" mois\"),\n", - " x = \"Formation sanitaire\",\n", - " y = \"Élément de données\"\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " axis.text.x = element_blank(), # trop nombreux\n", - " axis.ticks.x = element_blank(),\n", - " axis.text.y = element_text(size = 11),\n", - " plot.title = element_text(face = \"bold\", size = 16),\n", - " panel.grid = element_blank(),\n", - " legend.position = \"right\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "7a5ed92b-f048-4467-8d3a-f86c6a4fe141", - "metadata": {}, - "source": [ - "### 7. 
Tendances nationales et mensuelles par élément de données" - ] - }, - { - "cell_type": "markdown", - "id": "6371c0e1-a5a7-423e-9df3-fc9eb8214bb6", - "metadata": {}, - "source": [ - "Cette section présente l’évolution temporelle des valeurs mensuelles totales pour chaque indicateur de paludisme au cours de la période analysée. Les courbes montrent la somme des valeurs rapportées à travers toutes les formations sanitaires et toutes les désagrégations.\n", - "- Chaque graphique correspond à un indicateur agrégé (par exemple, cas confirmés, cas présumés, décès, etc.).\n", - "- L’axe horizontal représente le temps (mois), et l’axe vertical le total des valeurs rapportées pour l’ensemble du pays.\n", - "\n", - "Ces tendances permettent de visualiser les fluctuations saisonnières et d’identifier d’éventuelles anomalies ou ruptures dans la dynamique des cas rapportés." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e56014d5-de3f-45a1-8a4f-a05f4ecc78d8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# routine data\n", - "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "582f4c5e-b650-4422-bc94-873fd1f0d399", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Outlier removal removed: keep raw values as reported\n", - "routine_data <- routine_data %>%\n", - " mutate(VALUE = suppressWarnings(as.numeric(VALUE))) # ensure numeric\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1cd7f87b-3ea0-4c32-869d-79f73c9fd016", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": 
[ - "### 1. Build mapping table (Categorie, DX, optional CO)\n", - "indicator_map <- imap_dfr(indicator_defs, function(dx_list, categorie) {\n", - " tibble(raw = dx_list) %>%\n", - " mutate(\n", - " DX = str_extract(raw, \"^[^\\\\.]+\"), # before first \".\"\n", - " CO = str_extract(raw, \"(?<=\\\\.).+\"), # after first \".\", if present\n", - " Categorie = categorie\n", - " ) %>%\n", - " select(Categorie, DX, CO)\n", - "})\n", - "\n", - "### 2. Clean routine data (fresh, with DX + CO columns present)\n", - "rd_clean <- routine_data %>%\n", - " mutate(\n", - " VALUE = suppressWarnings(as.numeric(VALUE)),\n", - " month = ymd(paste0(PE, \"01\"))\n", - " ) %>%\n", - " filter(!is.na(month), !is.na(DX))\n", - "\n", - "### 3. Split mapping into “DX only” vs “DX+CO”, then join\n", - "\n", - "# (a) Categories where CO is NOT specified: include all COs for that DX\n", - "map_dx_only <- indicator_map %>%\n", - " filter(is.na(CO)) %>%\n", - " distinct(Categorie, DX)\n", - "\n", - "joined_dx_only <- rd_clean %>%\n", - " inner_join(\n", - " map_dx_only,\n", - " by = \"DX\",\n", - " relationship = \"many-to-many\" # <-- silence warning, intentional\n", - " )\n", - "\n", - "# (b) Categories where CO IS specified: restrict to that exact DX+CO pair\n", - "map_dx_co <- indicator_map %>%\n", - " filter(!is.na(CO)) %>%\n", - " distinct(Categorie, DX, CO)\n", - "\n", - "joined_dx_co <- rd_clean %>%\n", - " inner_join(\n", - " map_dx_co,\n", - " by = c(\"DX\", \"CO\"),\n", - " relationship = \"many-to-many\" # <-- silence warning, intentional\n", - " )\n", - "\n", - "# (c) Combine\n", - "rd_cat <- bind_rows(joined_dx_only, joined_dx_co)\n", - "\n", - "# Ensure DX_NAME exists and create wrapped labels for long legend items\n", - "if (!\"DX_NAME\" %in% names(rd_cat)) {\n", - " rd_cat <- rd_cat %>%\n", - " left_join(\n", - " routine_data %>% distinct(DX, DX_NAME),\n", - " by = \"DX\"\n", - " )\n", - "}\n", - "\n", - "rd_cat <- rd_cat %>%\n", - " mutate(\n", - " DX_NAME = coalesce(DX_NAME, 
DX),\n", - " DX_LABEL = stringr::str_wrap(DX_NAME, width = 40)\n", - " )\n", - "\n", - "### 4. Monthly totals per category and per data element (DX_NAME)\n", - "monthly_by_dx <- rd_cat %>%\n", - " group_by(Categorie, DX_NAME, DX_LABEL, month) %>%\n", - " summarise(\n", - " total = sum(VALUE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "### 5. Ensure full monthly sequence per (Categorie, DX_NAME)\n", - "monthly_by_dx_complete <- monthly_by_dx %>%\n", - " group_by(Categorie, DX_NAME, DX_LABEL) %>%\n", - " complete(\n", - " month = seq(min(month, na.rm = TRUE),\n", - " max(month, na.rm = TRUE),\n", - " by = \"1 month\"),\n", - " fill = list(total = 0)\n", - " ) %>%\n", - " ungroup()\n", - "\n", - "### 6. Build one plot per category\n", - "min_month <- min(ymd(paste0(routine_data$PE, \"01\")), na.rm = TRUE)\n", - "max_month <- max(ymd(paste0(routine_data$PE, \"01\")), na.rm = TRUE)\n", - "\n", - "plots_by_cat <- monthly_by_dx_complete %>%\n", - " split(.$Categorie) %>%\n", - " imap(function(df_cat, cat_name) {\n", - " has_multiple_months <- dplyr::n_distinct(df_cat$month) > 1\n", - "\n", - " p <- ggplot(\n", - " df_cat,\n", - " aes(x = month, y = total, color = DX_LABEL, group = DX_NAME)\n", - " )\n", - "\n", - " if (has_multiple_months) {\n", - " p <- p + geom_line(linewidth = 1.0, alpha = 0.9)\n", - " }\n", - "\n", - " p +\n", - " geom_point(size = 2.0, alpha = 0.95) +\n", - " \n", - " # Fixed x-axis for all categories\n", - " scale_x_date(\n", - " limits = c(min_month, max_month),\n", - " date_breaks = \"1 year\",\n", - " date_labels = \"%Y\"\n", - " ) +\n", - " \n", - " # Y-axis with safe formatting (avoid scale_cut bug)\n", - " scale_y_continuous(\n", - " labels = function(x) {\n", - " ifelse(x >= 1e6, paste0(round(x/1e6, 1), \"M\"),\n", - " ifelse(x >= 1e3, paste0(round(x/1e3, 1), \"K\"),\n", - " as.character(round(x))))\n", - " }\n", - " ) +\n", - " \n", - " labs(\n", - " title = paste0(cat_name, \" - series mensuelles par element de 
donnee\"),\n", - " subtitle = \"Somme des valeurs mensuelles (toutes UO et desagregations confondues)\",\n", - " x = \"Temps\",\n", - " y = \"Total national rapporte\",\n", - " color = \"Element de donnee\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 16),\n", - " plot.subtitle = element_text(size = 11, margin = margin(b = 8)),\n", - " axis.title.x = element_text(size = 12, face = \"bold\", margin = margin(t = 8)),\n", - " axis.title.y = element_text(size = 12, face = \"bold\", margin = margin(r = 8)),\n", - " axis.text.x = element_text(size = 10, angle = 45, hjust = 1, vjust = 1),\n", - " axis.text.y = element_text(size = 10),\n", - " legend.position = \"right\",\n", - " legend.title = element_text(size = 12, face = \"bold\"),\n", - " legend.text = element_text(size = 10),\n", - " legend.key.width = unit(1.2, \"lines\"),\n", - " legend.key.height = unit(0.95, \"lines\"),\n", - " panel.grid.minor = element_blank()\n", - " ) +\n", - " guides(color = guide_legend(ncol = 1, byrow = TRUE, override.aes = list(size = 3, alpha = 1)))\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b6b396d-91a3-4839-849b-f720b4ca61b5", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# Export each category plot with dynamic canvas to keep legends readable\n", - "if (!dir.exists(FIGURES_PATH)) {\n", - " dir.create(FIGURES_PATH, recursive = TRUE)\n", - "}\n", - "\n", - "options(repr.plot.width = 16, repr.plot.height = 8)\n", - "\n", - "purrr::iwalk(\n", - " plots_by_cat,\n", - " ~{\n", - " category_name <- .y\n", - " safe_name <- gsub(\"[^A-Za-z0-9]+\", \"_\", 
category_name)\n", - " \n", - " # Increase height when category has many legend items\n", - " n_items <- monthly_by_dx_complete %>%\n", - " filter(Categorie == category_name) %>%\n", - " distinct(DX_LABEL) %>%\n", - " nrow()\n", - " \n", - " fig_height <- max(8, min(18, 5 + (0.35 * n_items)))\n", - " file_name <- paste0(COUNTRY_CODE, \"_extract_trend_\", safe_name, \".png\")\n", - " \n", - " ggsave(\n", - " filename = file_name,\n", - " path = FIGURES_PATH,\n", - " plot = .x,\n", - " width = 18,\n", - " height = fig_height,\n", - " units = \"in\",\n", - " dpi = 300,\n", - " bg = \"white\"\n", - " )\n", - " \n", - " print(.x)\n", - " log_msg(glue::glue(\"Plot exporte: {file.path(FIGURES_PATH, file_name)}\"))\n", - " }\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r new file mode 100644 index 0000000..f72eb97 --- /dev/null +++ b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r @@ -0,0 +1,73 @@ +# Shared helpers for snt_dhis2_extract notebooks. 
+
+#' Build a GeoJSON Point string from latitude/longitude.
+#'
+#' Coordinates are written in `[lon, lat]` order. `%f` prints six decimal
+#' places, so inputs with more digits are rounded in the output string.
+#'
+#' @param lat Numeric latitude(s).
+#' @param lon Numeric longitude(s).
+#' @return Character vector of GeoJSON Point strings (`sprintf()` is
+#'   vectorised over its arguments).
+make_point_geojson <- function(lat, lon) {
+  sprintf('{"type": "Point", "coordinates": [%f, %f]}', lon, lat)
+}
+
+#' Overwrite the geometry of a fixed list of organisation units.
+#'
+#' Seven hard-coded ids receive a point geometry built with
+#' `make_point_geojson()`. Ids that are not present in the table are skipped,
+#' and `NA` ids in the table are ignored, so the helper never aborts just
+#' because one of the hard-coded units is missing from the input.
+#'
+#' @param group_prioritaires_table Data frame with an `id` column and a
+#'   `geometry` column holding GeoJSON strings.
+#' @return The input table with `geometry` replaced on the matching rows.
+apply_ner_manual_geometry_fixes <- function(group_prioritaires_table) {
+  manual_points <- data.frame(
+    id = c(
+      "xMqXanPgczy",
+      "sgO4yBg59SJ",
+      "oHRvIBeR5xH",
+      "TVaP0vBLvat",
+      "evMtQ7bLFYI",
+      "u3xCSh4hG9Q",
+      "P1oyCQT39rj"
+    ),
+    lat = c(
+      14.212177799561589,
+      13.485271755127068,
+      13.551421362165923,
+      13.509657990942971,
+      13.586255600670649,
+      13.509793678687808,
+      13.535431049590938
+    ),
+    lon = c(
+      1.4625739941131144,
+      7.143422105623865,
+      2.116344191939423,
+      2.1473435456528174,
+      2.0918749136394097,
+      2.147386518669057,
+      2.09186651126039
+    ),
+    stringsAsFactors = FALSE
+  )
+
+  for (i in seq_len(nrow(manual_points))) {
+    # which() drops NA comparisons and returns integer(0) when the id is
+    # absent; assigning through a logical mask would error in both cases.
+    rows <- which(group_prioritaires_table$id == manual_points$id[[i]])
+    if (length(rows) > 0) {
+      group_prioritaires_table[rows, "geometry"] <- make_point_geojson(
+        manual_points$lat[[i]],
+        manual_points$lon[[i]]
+      )
+    }
+  }
+
+  group_prioritaires_table
+}
+
+#' Count organisation units that were open at some point during a year.
+#'
+#' A row is kept when `OPENING_DATE` is on or before 31 December of `y` and
+#' `CLOSED_DATE` is either missing or on/after 1 January of `y`.
+#'
+#' @param df Data frame with `OPENING_DATE` and `CLOSED_DATE` columns that
+#'   `as.Date()` can convert.
+#' @param y Year; coerced with `as.integer()`.
+#' @return One-row data frame with `Annee` (the year) and
+#'   `Ouvertes_pyramide` (number of rows kept).
+open_in_year <- function(df, y) {
+  y <- as.integer(y)
+  year_start <- as.Date(sprintf("%s-01-01", y))
+  year_end <- as.Date(sprintf("%s-12-31", y))
+  df %>%
+    dplyr::filter(
+      as.Date(OPENING_DATE) <= year_end,
+      is.na(CLOSED_DATE) | as.Date(CLOSED_DATE) >= year_start
+    ) %>%
+    dplyr::summarise(Annee = y, Ouvertes_pyramide = dplyr::n(), .groups = "drop")
+}
+
+#' Map free-text facility names to a facility-type label.
+#'
+#' Input is whitespace-squished and upper-cased, then matched against prefix
+#' patterns; `case_when()` returns the label of the first pattern that matches.
+#'
+#' @param x Character vector of facility names.
+#' @return Character vector of facility-type labels, same length as `x`.
+norm_fosa_type <- function(x) {
+  x_up <- stringr::str_to_upper(stringr::str_squish(x))
+  dplyr::case_when(
+    stringr::str_detect(x_up, "^HD\\b") ~ "HD (hôpital de district)",
+    stringr::str_detect(x_up, "^CSI\\b") ~ "CSI (centre de santé intégré)",
+    stringr::str_detect(x_up, "^CS\\b") ~ "CS (case de santé)",
+    stringr::str_detect(x_up, "^(SS\\b|SALLE\\b|SALLE D'ACCOUCHEMENT\\b)") ~ "SS / Salle (soins/maternité)",
+    stringr::str_detect(x_up, "^(CLINIQUE|POLYCLINIQUE)\\b") ~ "Clinique (privé)",
+    stringr::str_detect(x_up, "^CABINET\\b") ~ "Cabinet (privé)",
+    stringr::str_detect(x_up, "^(INFIRMERIE|INFIRM)\\b") ~ "Infirmerie (privé)",
+
stringr::str_detect(x_up, "^CNSS\\b") ~ "CNSS", + TRUE ~ "Autre" + ) +} diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index 8fdf547..e4d5980 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -1,1394 +1,1369 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "47551f88-b40b-449f-9dc1-59db71183611", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 💡 Comments / Questions & To Do's:\n", + "# - filter by YEAR keep only 2022-2024): \n", + "# 1. Why these years? Arbitrary choice? Based on what? linked to what?\n", + "# 2. Is this a paramater is some other pipeline? if so, should be integrated here somehow \n", + "# - Missing data: why do we have NA values for population? Are these real NA (missing data) or 0?\n", + "# - OUTLIERS: there are clear outliers (i.e., DS AGADEZ): shall we do some simple data cleaning here?\n", + "# - Population catagories (breaks) do we have a specific scale in mind \n", + "# (i.e., use same as another country) or can I set it based on the data" + ], + "execution_count": null, + "outputs": [], + "id": "47551f88-b40b-449f-9dc1-59db71183611" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# TO DO / FINISH:\n", + "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", + "# - (maybe) also add meaningful messages\n", + "# - Add code to export PNG files of relevant figures\n", + "# - Set dynamic boundaries for POPULATION categories? 
(so can use same code in different countries)\n", + "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)" + ], + "execution_count": null, + "outputs": [], + "id": "342b6b54-4812-4b07-b408-68a034b4014e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Paths and Config" + ], + "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_formatting\")\n", + "\n", + "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_formatting/reporting\")\n", + "\n", + "# Create output directories if they don't exist (before loading utils)\n", + "figures_dir <- file.path(REPORTING_NB_PATH, \"outputs\", \"figures\")\n", + "if (!dir.exists(figures_dir)) {\n", + " dir.create(figures_dir, recursive = TRUE)\n", + " print(paste0(\"Created figures directory: \", figures_dir))\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "7d3285c7-1a60-46ad-9541-36a703d51924" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_formatting_report.r\"))" + ], + "execution_count": null, + "outputs": [], + "id": "732733e7-8890-4c3e-be64-496fd4a2c800" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "required_packages <- c(\n", + " \"tidyverse\", \n", + " \"arrow\", \n", + " \"sf\", \n", + " \"reticulate\",\n", + " \"patchwork\"\n", + ") \n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ], + 
"execution_count": null, + "outputs": [], + "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load openhexa.sdk\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ], + "execution_count": null, + "outputs": [], + "id": "20475dd9-5091-4f87-9ae2-d0235921fe94" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })" + ], + "execution_count": null, + "outputs": [], + "id": "9f70d726-1c34-47dc-b963-bb23e42994bb" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Configuration variables\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ], + "execution_count": null, + "outputs": [], + "id": "90d58c60-fb4e-40e4-add8-5f258f541843" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# printdim() loaded from code/snt_utils.r" + ], + 
"execution_count": null, + "outputs": [], + "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Import data" + ], + "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# import analytics DHIS2 data\n", + "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", + " \" the report cannot be executed. [ERROR DETAILS] \", conditionMessage(e))\n", + " stop(msg)\n", + " })\n", + "\n", + "printdim(routine_data)" + ], + "execution_count": null, + "outputs": [], + "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " population_data <- NULL\n", + " })\n", + "\n", + "printdim(population_data)" + ], + "execution_count": null, + "outputs": [], + "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes_data <- NULL\n", + " })\n", + "\n", + "printdim(shapes_data)" + ], + "execution_count": 
null, + "outputs": [], + "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [], + "id": "c881f748-e391-46c9-a36a-ed11c238a6ce" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Complétude des indicateurs composites**\n" + ], + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" + ], + "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "head(routine_data)" + ], + "execution_count": null, + "outputs": [], + "id": "c7691e61-6542-4d40-af2a-c018d29b86a8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" + ], + "id": "c109e82d-8c72-41f0-857a-322163cf213e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 0: Rename your data for convenience\n", + "data <- routine_data\n", + "\n", + "# Step 1: Convert PERIOD to DATE\n", + "data <- data %>%\n", + " mutate(\n", + " DATE = ymd(paste0(PERIOD, \"01\"))\n", + " )\n", + "\n", + "# Step 2: Reshape wide to long: INDICATOR = column name (e.g., CONF), VALUE = value\n", + "indicator_vars <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", + "))\n", + "\n", + "long_data <- data %>%\n", + " pivot_longer(cols = all_of(indicator_vars),\n", + " names_to = \"INDICATOR\",\n", 
+ " values_to = \"VALUE\") %>%\n", + " rename(OU = OU_ID)\n", + "\n", + "# Step 3: Build expected full grid (OU × INDICATOR × DATE)\n", + "full_grid <- expand_grid(\n", + " OU = unique(long_data$OU),\n", + " INDICATOR = unique(long_data$INDICATOR),\n", + " DATE = unique(long_data$DATE)\n", + ")\n", + "\n", + "# Step 4: Join and assess reporting status\n", + "reporting_check <- full_grid %>%\n", + " left_join(\n", + " long_data %>% select(OU, INDICATOR, DATE, VALUE),\n", + " by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", + " ) %>%\n", + " mutate(\n", + " is_missing = is.na(VALUE),\n", + " is_zero = VALUE == 0 & !is.na(VALUE),\n", + " is_positive = VALUE > 0 & !is.na(VALUE)\n", + " )\n", + "\n", + "# Step 5: Summarise reporting status\n", + "reporting_summary <- reporting_check %>%\n", + " group_by(INDICATOR, DATE) %>%\n", + " summarise(\n", + " n_total = n_distinct(OU),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 6: Prepare plot-ready data\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(\n", + " cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\",\n", + " values_to = \"Percentage\"\n", + " ) %>%\n", + " mutate(\n", + " Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")\n", + " ) %>%\n", + " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n" + ], + "execution_count": null, + "outputs": [], + "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "options(repr.plot.width = 17, 
repr.plot.height = 10)\n", + "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", + " geom_col(position = \"stack\") +\n", + " facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", + " scale_y_continuous() +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", + " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", + " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", + " x = \"Mois\", y = \"% des formations sanitaires\",\n", + " fill = \"Statut du rapportage\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " strip.text = element_text(size = 16),\n", + " axis.title = element_text(size = 16),\n", + " axis.text = element_text(size = 16)\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." 
+ ], + "id": "e6871759-714b-437a-8b9c-5a5a06656567" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 0: Rename for convenience\n", + "data <- routine_data\n", + "\n", + "# Step 1: Convert PERIOD to proper Date\n", + "data <- data %>%\n", + " mutate(Date = ymd(paste0(PERIOD, \"01\")))\n", + "\n", + "# Step 2: Identify indicator columns\n", + "indicator_cols <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\",\n", + " \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", + "))\n", + "\n", + "# Step 3: Reshape to long format\n", + "data_long <- data %>%\n", + " select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%\n", + " pivot_longer(cols = all_of(indicator_cols),\n", + " names_to = \"Indicator\", values_to = \"value\") %>%\n", + " mutate(value = as.numeric(value))\n", + "\n", + "# Step 4: Full expected grid at ADM2 level\n", + "full_grid <- expand_grid(\n", + " ADM2_ID = unique(data_long$ADM2_ID),\n", + " Indicator = unique(data_long$Indicator),\n", + " Date = unique(data_long$Date)\n", + ")\n", + "\n", + "# Step 5: Detect if *any* health facility reported per district × indicator × date\n", + "reporting_check <- data_long %>%\n", + " group_by(ADM2_ID, Indicator, Date) %>%\n", + " summarise(\n", + " is_missing = all(is.na(value)),\n", + " is_zero = all(value == 0, na.rm = TRUE),\n", + " is_positive = any(value > 0, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 6: Join with full grid to fill in missing ADM2s\n", + "reporting_full <- full_grid %>%\n", + " left_join(reporting_check, by = c(\"ADM2_ID\", \"Indicator\", \"Date\")) %>%\n", + " mutate(\n", + " is_missing = replace_na(is_missing, TRUE),\n", + " is_zero = replace_na(is_zero, FALSE),\n", + " is_positive = replace_na(is_positive, FALSE)\n", + " )\n", + "\n", + "# Step 7: Summarise by Indicator and Date\n", + "reporting_summary <- reporting_full %>%\n", + " 
group_by(Indicator, Date) %>%\n", + " summarise(\n", + " n_total = n_distinct(ADM2_ID),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero & !is_missing),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 8: Reshape for plotting\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\", values_to = \"Percentage\") %>%\n", + " mutate(Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")) %>%\n", + " complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", + "\n", + "# Step 9: Plot\n", + "ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +\n", + " geom_col(position = \"stack\") +\n", + " facet_wrap(~ Indicator, scales = \"free_y\") +\n", + " scale_y_continuous(limits = c(0, 100)) +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", + " title = \"Taux de rapportage par indicateur (niveau district)\",\n", + " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", + " x = \"Mois\", y = \"% des districts\",\n", + " fill = \"Statut du rapportage\"\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 18),\n", + " strip.text = element_text(size = 14),\n", + " axis.title = element_text(size = 14),\n", + " axis.text = element_text(size = 12)\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157" + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Cohérence interne des indicateurs composites" + ], + "id": "5cda3985" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", + "\n", + "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." + ], + "id": "c131a633" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# detect_mad_outliers() loaded from utils/snt_dhis2_formatting_report.r" + ], + "execution_count": null, + "outputs": [], + "id": "936268f4" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 0: Select relevant core indicators\n", + "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", + "\n", + "# Step 1: Convert wide to long format\n", + "routine_long <- routine_data %>%\n", + " pivot_longer(\n", + " cols = all_of(target_indicators),\n", + " names_to = \"indicator\",\n", + " values_to = \"value\"\n", + " ) %>%\n", + " mutate(\n", + " PERIOD = as.character(PERIOD), # Ensure PERIOD is character for join\n", + " OU = OU_ID # Alias for join clarity\n", + " )\n", + "\n", + "# Step 2: Filter to indicators of interest\n", + "routine_long_filtered <- routine_long %>%\n", + " filter(indicator %in% target_indicators)\n", + "\n", + "# Step 3: Calculate MAD15\n", + "mad15_data <- detect_mad_outliers(\n", + " routine_long_filtered,\n", + " deviation = 15,\n", + " outlier_column = \"mad15\"\n", + ")\n", + "\n", + "# Step 4: Calculate MAD10 (only where mad15 not flagged or 
missing)\n", + "mad10_flags <- mad15_data %>%\n", + " filter(is.na(mad15) | mad15 == FALSE, !is.na(value)) %>%\n", + " detect_mad_outliers(deviation = 10, outlier_column = \"mad10\")\n", + "\n", + "# Step 5: Combine MAD15 and MAD10 results\n", + "mad_combined <- mad15_data %>%\n", + " left_join(\n", + " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", + " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "881f9625" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", + "outlier_flags <- mad_combined %>%\n", + " filter(mad15 == TRUE | mad10 == TRUE) %>%\n", + " mutate(PERIOD = as.numeric(PERIOD)) %>%\n", + " select(PERIOD, OU, indicator)\n", + "\n", + "# Step 7: Reshape routine_data to long format for filtering\n", + "routine_long_all <- routine_data %>%\n", + " pivot_longer(\n", + " cols = all_of(target_indicators),\n", + " names_to = \"indicator\",\n", + " values_to = \"value\"\n", + " ) %>%\n", + " mutate(OU = OU_ID)\n", + "\n", + "# Step 8: Remove outliers\n", + "routine_long_clean <- routine_long_all %>%\n", + " anti_join(outlier_flags, by = c(\"PERIOD\", \"OU\", \"indicator\"))\n", + "\n", + "# Step 9: Reshape back to wide format if needed\n", + "routine_data_clean <- routine_long_clean %>%\n", + " select(-OU) %>%\n", + " pivot_wider(names_from = indicator, values_from = value)\n" + ], + "execution_count": null, + "outputs": [], + "id": "04d41ed1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Cohérence des indicateurs" + ], + "id": "c6a5a77b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 1: Extract year and month from PERIOD\n", + "routine_hd_month <- routine_data_clean %>%\n", + " mutate(\n", + " YEAR = substr(PERIOD, 1, 4),\n", + " MONTH = substr(PERIOD, 5, 6)\n", + " ) 
%>%\n", + " group_by(ADM2_ID, YEAR, MONTH) %>%\n", + " summarise(\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 2: Create scatter plots\n", + "options(repr.plot.width = 14, repr.plot.height = 6)\n", + "\n", + "p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +\n", + " geom_point(alpha = 0.5, color = \"blue\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +\n", + " geom_point(alpha = 0.5, color = \"darkgreen\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +\n", + " geom_point(alpha = 0.5, color = \"purple\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "# Step 3: Combine plots\n", + "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" + ], + "execution_count": null, + "outputs": [], + "id": "6cfeb18e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 1: Aggregate monthly values\n", + "rds_clean_month <- routine_data_clean %>%\n", + " mutate(\n", + " YEAR = substr(PERIOD, 1, 4),\n", + " MONTH = substr(PERIOD, 5, 6),\n", + " DATE = as.Date(paste(YEAR, MONTH, \"01\", sep = \"-\"))\n", + " ) %>%\n", + " 
group_by(YEAR, MONTH, DATE) %>%\n", + " summarise(\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 2: Plot monthly national trends\n", + "options(repr.plot.width = 14, repr.plot.height = 6)\n", + "rds_clean_month %>%\n", + " pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = \"Indicator\") %>%\n", + " ggplot(aes(x = DATE, y = value, color = Indicator)) +\n", + " geom_line(linewidth = 1.2) +\n", + " labs(\n", + " title = \"Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)\",\n", + " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " axis.title = element_text(size = 16),\n", + " axis.text = element_text(size = 16),\n", + " legend.title = element_text(size = 16),\n", + " legend.text = element_text(size = 16)\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "0df24272" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Carte des populations par district sanitaire (DS)" + ], + "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.1. 
Carte de la Population pour ADM2 " + ], + "id": "da58bbd3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Code from previous version of the notebook\n", + "# Uses continuous scale for population\n", + "\n", + "# Run if population_data is available\n", + "if (!is.null(population_data) & !is.null(shapes_data)) {\n", + " # Join population to spatial shapes\n", + " map_data <- shapes_data %>%\n", + " left_join(population_data, by = \"ADM2_ID\")\n", + " \n", + " # Plot population per district (DS)\n", + " plot <- ggplot(map_data) +\n", + " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", + " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Données DHIS2\",\n", + " caption = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " theme_minimal(base_size = 14) \n", + "\n", + " print(plot)\n", + "\n", + "} else {\n", + " print(\"Population or shapes data not available.\")\n", + "}\n" + ], + "execution_count": null, + "outputs": [], + "id": "6965155d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", + "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " + ], + "id": "eb276692" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "population_data_filtered <- population_data\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " print(\"🇳🇪 Executing NER specific code ... \")\n", + "\n", + " IRdisplay::display_markdown(\"\n", + " ### 🇳🇪 NER specific code \n", + " Made ad hoc to allow comparison with data from other or previous analyses. 
Namely:\n", + " * only year 2022 to 2024\n", + " * specific palette (yellowish to brick red)\n", + " * specific intervals\n", + " * looks at **disaggregated** population <- this is sometimes contry-specific!\n", + "\")\n", + "\n", + " # --- Filter data to keep only 2022-2024 ... ---\n", + " years_to_keep <- 2022:2024\n", + " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", + "\n", + " # --- Read data from SNT_metadata.json ---\n", + " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + " # --- Assign population breaks from metadata ---\n", + " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", + " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", + " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", + "\n", + " # --- Create dynamic labels based on breaks ---\n", + " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", + " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", + " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", + "\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "4d33724e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "NER_palette_population <- c(\n", + " \"1\" = \"#fae6db\",\n", + " \"2\" = \"#f1b195\",\n", + " \"3\" = \"#ea7354\",\n", + " \"4\" = \"#cc3f32\",\n", + " \"5\" = \"#972620\"\n", + ")\n" + ], + "execution_count": null, + "outputs": [], + "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Population Totales" + ], + "id": "95892df7-e5b8-4d7a-bf96-88673e633370" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + 
}, + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + " # IMPORTANT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", + "names(NER_palette_population) <- labels_tot\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION,\n", + " breaks = c(0, value_breaks_tot, Inf),\n", + " labels = labels_tot, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_tot, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export to see better in high resolution\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", + " width = 14,\n", + " height = 8,\n", + " dpi = 300\n", + ")\n", + "}\n" + ], + "execution_count": null, + "outputs": [], + "id": "a0a196b8-2db5-478d-899a-48985d1735f0" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Population Femmes Enceintes (FE)" + ], + "id": 
"aca477aa-4d93-4a74-ad8c-32a30f85a552" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + "names(NER_palette_population) <- labels_fe\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_FE,\n", + " breaks = c(0, value_breaks_fe, Inf),\n", + " labels = labels_fe, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_fe, \n", + " drop = FALSE # Prevents dropping empty levels from legend\n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export to see better in high resolution\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", + " width = 14, \n", + " height = 8,\n", + " dpi = 300\n", + ")\n", + "\n", + "}\n" + ], + "execution_count": null, + "outputs": [], + "id": "9324a56b" + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "### Population Enfants moins de 5 ans (U5)" + ], + "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + "names(NER_palette_population) <- labels_u5\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_U5,\n", + " breaks = c(0, value_breaks_u5, Inf),\n", + " labels = labels_u5, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_u5, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export PNG\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", + " width = 14, \n", + " height = 8,\n", + " dpi = 300\n", + ")\n", + "\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "4046761f" + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## 3.3. Complétude et qualité des données de la Population" + ], + "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Population Totale" + ], + "id": "0d86ed4a-e194-496b-9440-ad206157ee17" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# hist(population_data$POPULATION)\n", + "hist(population_data_filtered$POPULATION)" + ], + "execution_count": null, + "outputs": [], + "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "ggplot(population_data_filtered) +\n", + " geom_point(aes(x = POPULATION,\n", + " y = fct_reorder(ADM2_NAME, POPULATION),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "# Export PNG\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", \"hist_population_totale.png\"),\n", + " units = \"cm\",\n", + " width = 15,\n", + " height = 23,\n", + " bg = \"white\"\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Population Femmes Enceintes (FE)" + ], + "id": 
"d6ab387a-cc9e-42b9-a634-12af21bef0f5" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", + "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", + " hist(population_data_filtered$POPULATION_FE)\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", + " \n", + "ggplot(population_data_filtered) +\n", + " geom_point(aes(x = POPULATION_FE,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+05, 1e+06, 1.5e+06),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "} " + ], + "execution_count": null, + "outputs": [], + "id": "4200afa2-e2f0-4876-9842-141b96f32fe8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Population Enfants moins de 5 ans (U5)" + ], + "id": "e39305c0-3700-48c3-967a-b9c6af3e737f" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", + " hist(population_data_filtered$POPULATION_U5)\n", + "}" + ], 
+ "execution_count": null, + "outputs": [], + "id": "bbda9b88-9b91-4845-83a8-795a12124999" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", + "\n", + "ggplot(population_data_filtered) +\n", + " geom_point(aes(x = POPULATION_U5,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d" } - }, - "outputs": [], - "source": [ - "# 💡 Comments / Questions & To Do's:\n", - "# - filter by YEAR keep only 2022-2024): \n", - "# 1. Why these years? Arbitrary choice? Based on what? linked to what?\n", - "# 2. Is this a paramater is some other pipeline? if so, should be integrated here somehow \n", - "# - Missing data: why do we have NA values for population? 
Are these real NA (missing data) or 0?\n", - "# - OUTLIERS: there are clear outliers (i.e., DS AGADEZ): shall we do some simple data cleaning here?\n", - "# - Population catagories (breaks) do we have a specific scale in mind \n", - "# (i.e., use same as another country) or can I set it based on the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "342b6b54-4812-4b07-b408-68a034b4014e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# TO DO / FINISH:\n", - "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", - "# - (maybe) also add meaningful messages\n", - "# - Add code to export PNG files of relevant figures\n", - "# - Set dynamic boundaries for POPULATION categories? (so can use same code in different countries)\n", - "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)" - ] - }, - { - "cell_type": "markdown", - "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5", - "metadata": {}, - "source": [ - "## 0. 
Paths and Config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d3285c7-1a60-46ad-9541-36a703d51924", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_formatting/reporting\")\n", - "\n", - "# Create output directories if they don't exist (before loading utils)\n", - "figures_dir <- file.path(REPORTING_NB_PATH, \"outputs\", \"figures\")\n", - "if (!dir.exists(figures_dir)) {\n", - " dir.create(figures_dir, recursive = TRUE)\n", - " print(paste0(\"Created figures directory: \", figures_dir))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "732733e7-8890-4c3e-be64-496fd4a2c800", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "required_packages <- c(\n", - " \"tidyverse\", \n", - " \"arrow\", \n", - " \"sf\", \n", - " \"reticulate\",\n", - " \"patchwork\"\n", - ") \n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20475dd9-5091-4f87-9ae2-d0235921fe94", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", 
- "# Load openhexa.sdk\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f70d726-1c34-47dc-b963-bb23e42994bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90d58c60-fb4e-40e4-add8-5f258f541843", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration variables\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69", - "metadata": {}, - "source": [ - "## 1. 
Import data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# import analytics DHIS2 data\n", - "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", - " \" the report cannot be executed. [ERROR DETAILS] \", conditionMessage(e))\n", - " stop(msg)\n", - " })\n", - "\n", - "printdim(routine_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " population_data <- NULL\n", - " })\n", - "\n", - "printdim(population_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes_data <- NULL\n", - " })\n", - "\n", - "printdim(shapes_data)" - ] - }, - { - "cell_type": "markdown", - "id": "c881f748-e391-46c9-a36a-ed11c238a6ce", - "metadata": {}, - "source": [] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", - "metadata": {}, - "source": [ - "# **Complétude des indicateurs composites**\n" - ] - }, - { - "cell_type": "markdown", - "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de", - "metadata": {}, - "source": [ - "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7691e61-6542-4d40-af2a-c018d29b86a8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(routine_data)" - ] - }, - { - "cell_type": "markdown", - "id": "c109e82d-8c72-41f0-857a-322163cf213e", - "metadata": {}, - "source": [ - "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 0: Rename your data for convenience\n", - "data <- routine_data\n", - "\n", - "# Step 1: Convert PERIOD to DATE\n", - "data <- data %>%\n", - " mutate(\n", - " DATE = ymd(paste0(PERIOD, \"01\"))\n", - " )\n", - "\n", - "# Step 2: Reshape wide to long: INDICATOR = column name (e.g., CONF), VALUE = value\n", - "indicator_vars <- setdiff(names(data), c(\n", - " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", - "))\n", - "\n", - "long_data <- data %>%\n", - " pivot_longer(cols = all_of(indicator_vars),\n", - " names_to = \"INDICATOR\",\n", - " values_to = \"VALUE\") %>%\n", - " rename(OU = OU_ID)\n", - "\n", - "# Step 3: Build expected full grid (OU × 
INDICATOR × DATE)\n", - "full_grid <- expand_grid(\n", - " OU = unique(long_data$OU),\n", - " INDICATOR = unique(long_data$INDICATOR),\n", - " DATE = unique(long_data$DATE)\n", - ")\n", - "\n", - "# Step 4: Join and assess reporting status\n", - "reporting_check <- full_grid %>%\n", - " left_join(\n", - " long_data %>% select(OU, INDICATOR, DATE, VALUE),\n", - " by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", - " ) %>%\n", - " mutate(\n", - " is_missing = is.na(VALUE),\n", - " is_zero = VALUE == 0 & !is.na(VALUE),\n", - " is_positive = VALUE > 0 & !is.na(VALUE)\n", - " )\n", - "\n", - "# Step 5: Summarise reporting status\n", - "reporting_summary <- reporting_check %>%\n", - " group_by(INDICATOR, DATE) %>%\n", - " summarise(\n", - " n_total = n_distinct(OU),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 6: Prepare plot-ready data\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\",\n", - " values_to = \"Percentage\"\n", - " ) %>%\n", - " mutate(\n", - " Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")\n", - " ) %>%\n", - " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "options(repr.plot.width = 17, repr.plot.height = 10)\n", - "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", - " geom_col(position = 
\"stack\") +\n", - " facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", - " scale_y_continuous() +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur nulle rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", - " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", - " x = \"Mois\", y = \"% des formations sanitaires\",\n", - " fill = \"Statut du rapportage\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " strip.text = element_text(size = 16),\n", - " axis.title = element_text(size = 16),\n", - " axis.text = element_text(size = 16)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "e6871759-714b-437a-8b9c-5a5a06656567", - "metadata": {}, - "source": [ - "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 0: Rename for convenience\n", - "data <- routine_data\n", - "\n", - "# Step 1: Convert PERIOD to proper Date\n", - "data <- data %>%\n", - " mutate(Date = ymd(paste0(PERIOD, \"01\")))\n", - "\n", - "# Step 2: Identify indicator columns\n", - "indicator_cols <- setdiff(names(data), c(\n", - " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\",\n", - " \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", - "))\n", - "\n", - "# Step 3: Reshape to long format\n", - "data_long <- data %>%\n", - " select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%\n", - " pivot_longer(cols = all_of(indicator_cols),\n", - " names_to = \"Indicator\", values_to = \"value\") %>%\n", - " mutate(value = as.numeric(value))\n", - "\n", - "# Step 4: Full expected grid at ADM2 level\n", - "full_grid <- expand_grid(\n", - " ADM2_ID = unique(data_long$ADM2_ID),\n", - " Indicator = unique(data_long$Indicator),\n", - " Date = unique(data_long$Date)\n", - ")\n", - "\n", - "# Step 5: Detect if *any* health facility reported per district × indicator × date\n", - "reporting_check <- data_long %>%\n", - " group_by(ADM2_ID, Indicator, Date) %>%\n", - " summarise(\n", - " is_missing = all(is.na(value)),\n", - " is_zero = all(value == 0, na.rm = TRUE),\n", - " is_positive = any(value > 0, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 6: Join with full grid to fill in missing ADM2s\n", - "reporting_full <- full_grid %>%\n", - " left_join(reporting_check, by = c(\"ADM2_ID\", \"Indicator\", \"Date\")) %>%\n", - " mutate(\n", - " is_missing = replace_na(is_missing, TRUE),\n", - " is_zero = replace_na(is_zero, FALSE),\n", - " is_positive = replace_na(is_positive, FALSE)\n", - " )\n", - "\n", - "# Step 7: Summarise by Indicator and Date\n", - 
"reporting_summary <- reporting_full %>%\n", - " group_by(Indicator, Date) %>%\n", - " summarise(\n", - " n_total = n_distinct(ADM2_ID),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero & !is_missing),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 8: Reshape for plotting\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\", values_to = \"Percentage\") %>%\n", - " mutate(Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")) %>%\n", - " complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", - "\n", - "# Step 9: Plot\n", - "ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +\n", - " geom_col(position = \"stack\") +\n", - " facet_wrap(~ Indicator, scales = \"free_y\") +\n", - " scale_y_continuous(limits = c(0, 100)) +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur nulle rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = \"Taux de rapportage par indicateur (niveau district)\",\n", - " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", - " x = \"Mois\", y = \"% des districts\",\n", - " fill = \"Statut du rapportage\"\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 18),\n", - " strip.text = element_text(size = 14),\n", - " axis.title = element_text(size = 14),\n", - " axis.text = element_text(size = 12)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": 
"5cda3985", - "metadata": {}, - "source": [ - "# 2. Cohérence interne des indicateurs composites" - ] - }, - { - "cell_type": "markdown", - "id": "c131a633", - "metadata": {}, - "source": [ - "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", - "\n", - "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "936268f4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Function to detect outliers based on MAD method\n", - "detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = \"mad_flag\") {\n", - " data_long %>%\n", - " group_by(OU, indicator, YEAR) %>%\n", - " mutate(\n", - " median_val = median(value, na.rm = TRUE),\n", - " mad_val = mad(value, na.rm = TRUE),\n", - " \"{outlier_column}\" := value > (median_val + deviation * mad_val) | value < (median_val - deviation * mad_val)\n", - " ) %>%\n", - " ungroup()\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "881f9625", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 0: Select relevant core indicators\n", - "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", - "\n", - "# Step 1: Convert wide to long format\n", - "routine_long <- routine_data %>%\n", - " pivot_longer(\n", - " cols = all_of(target_indicators),\n", - " names_to = \"indicator\",\n", - " values_to = \"value\"\n", - " ) %>%\n", - " mutate(\n", - " PERIOD = as.character(PERIOD), # Ensure PERIOD is character for join\n", - " OU = OU_ID # Alias for join 
clarity\n", - " )\n", - "\n", - "# Step 2: Filter to indicators of interest\n", - "routine_long_filtered <- routine_long %>%\n", - " filter(indicator %in% target_indicators)\n", - "\n", - "# Step 3: Calculate MAD15\n", - "mad15_data <- detect_mad_outliers(\n", - " routine_long_filtered,\n", - " deviation = 15,\n", - " outlier_column = \"mad15\"\n", - ")\n", - "\n", - "# Step 4: Calculate MAD10 (only where mad15 not flagged or missing)\n", - "mad10_flags <- mad15_data %>%\n", - " filter(is.na(mad15) | mad15 == FALSE, !is.na(value)) %>%\n", - " detect_mad_outliers(deviation = 10, outlier_column = \"mad10\")\n", - "\n", - "# Step 5: Combine MAD15 and MAD10 results\n", - "mad_combined <- mad15_data %>%\n", - " left_join(\n", - " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", - " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04d41ed1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", - "outlier_flags <- mad_combined %>%\n", - " filter(mad15 == TRUE | mad10 == TRUE) %>%\n", - " mutate(PERIOD = as.numeric(PERIOD)) %>%\n", - " select(PERIOD, OU, indicator)\n", - "\n", - "# Step 7: Reshape routine_data to long format for filtering\n", - "routine_long_all <- routine_data %>%\n", - " pivot_longer(\n", - " cols = all_of(target_indicators),\n", - " names_to = \"indicator\",\n", - " values_to = \"value\"\n", - " ) %>%\n", - " mutate(OU = OU_ID)\n", - "\n", - "# Step 8: Remove outliers\n", - "routine_long_clean <- routine_long_all %>%\n", - " anti_join(outlier_flags, by = c(\"PERIOD\", \"OU\", \"indicator\"))\n", - "\n", - "# Step 9: Reshape back to wide format if needed\n", - "routine_data_clean <- routine_long_clean %>%\n", - " select(-OU) %>%\n", - " pivot_wider(names_from = indicator, values_from = value)\n" - ] - }, - { - "cell_type": "markdown", - "id": "c6a5a77b", - "metadata": 
{}, - "source": [ - "## 2.2 Cohérence des indicateurs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6cfeb18e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: Extract year and month from PERIOD\n", - "routine_hd_month <- routine_data_clean %>%\n", - " mutate(\n", - " YEAR = substr(PERIOD, 1, 4),\n", - " MONTH = substr(PERIOD, 5, 6)\n", - " ) %>%\n", - " group_by(ADM2_ID, YEAR, MONTH) %>%\n", - " summarise(\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 2: Create scatter plots\n", - "options(repr.plot.width = 14, repr.plot.height = 6)\n", - "\n", - "p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +\n", - " geom_point(alpha = 0.5, color = \"blue\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +\n", - " geom_point(alpha = 0.5, color = \"darkgreen\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +\n", - " geom_point(alpha = 0.5, color = \"purple\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "# Step 3: Combine plots\n", - "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "0df24272", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: Aggregate monthly values\n", - "rds_clean_month <- routine_data_clean %>%\n", - " mutate(\n", - " YEAR = substr(PERIOD, 1, 4),\n", - " MONTH = substr(PERIOD, 5, 6),\n", - " DATE = as.Date(paste(YEAR, MONTH, \"01\", sep = \"-\"))\n", - " ) %>%\n", - " group_by(YEAR, MONTH, DATE) %>%\n", - " summarise(\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 2: Plot monthly national trends\n", - "options(repr.plot.width = 14, repr.plot.height = 6)\n", - "rds_clean_month %>%\n", - " pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = \"Indicator\") %>%\n", - " ggplot(aes(x = DATE, y = value, color = Indicator)) +\n", - " geom_line(linewidth = 1.2) +\n", - " labs(\n", - " title = \"Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)\",\n", - " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " axis.title = element_text(size = 16),\n", - " axis.text = element_text(size = 16),\n", - " legend.title = element_text(size = 16),\n", - " legend.text = element_text(size = 16)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48", - "metadata": {}, - "source": [ - "# 3. Carte des populations par district sanitaire (DS)" - ] - }, - { - "cell_type": "markdown", - "id": "da58bbd3", - "metadata": {}, - "source": [ - "## 3.1. 
Carte de la Population pour ADM2 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6965155d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Code from previous version of the notebook\n", - "# Uses continuos scale for population\n", - "\n", - "# Run if population_data is available\n", - "if (!is.null(population_data) & !is.null(shapes_data)) {\n", - " # Join population to spatial shapes\n", - " map_data <- shapes_data %>%\n", - " left_join(population_data, by = \"ADM2_ID\")\n", - " \n", - " # Plot population per district (DS)\n", - " plot <- ggplot(map_data) +\n", - " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Données DHIS2\",\n", - " caption = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " theme_minimal(base_size = 14) \n", - "\n", - " print(plot)\n", - "\n", - "} else {\n", - " print(\"Population or shapes data not available.\")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "eb276692", - "metadata": {}, - "source": [ - "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", - "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d33724e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "population_data_filtered <- population_data\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " print(\"🇳🇪 Executing NER specific code ... \")\n", - "\n", - " IRdisplay::display_markdown(\"\n", - " ### 🇳🇪 NER specific code \n", - " Made ad hoc to allow comparison with data from other or previous analyses. 
Namely:\n", - " * only year 2022 to 2024\n", - " * specific palette (yellowish to brick red)\n", - " * specific intervals\n", - " * looks at **disaggregated** population <- this is sometimes contry-specific!\n", - "\")\n", - "\n", - " # --- Filter data to keep only 2022-2024 ... ---\n", - " years_to_keep <- 2022:2024\n", - " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", - "\n", - " # --- Read data from SNT_metadata.json ---\n", - " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - " # --- Assign population breaks from metadata ---\n", - " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", - " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", - " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", - "\n", - " # --- Define function to create dyanic labels based on breaks for pop category ---\n", - " create_dynamic_labels <- function(breaks) {\n", - " fmt <- function(x) {\n", - " format(x / 1000, big.mark = \"'\", scientific = FALSE, trim = TRUE)\n", - " }\n", - " \n", - " labels <- c(\n", - " paste0(\"< \", fmt(breaks[1]), \"k\"), # First label\n", - " paste0(fmt(breaks[-length(breaks)]), \" - \", fmt(breaks[-1]), \"k\"), # Middle\n", - " paste0(\"> \", fmt(breaks[length(breaks)]), \"k\") # Last label\n", - " ) \n", - " return(labels)\n", - " }\n", - "\n", - " # --- Create dynamic labels based on breaks ---\n", - " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", - " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", - " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30", - "metadata": { - 
"vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "NER_palette_population <- c(\n", - " \"1\" = \"#fae6db\",\n", - " \"2\" = \"#f1b195\",\n", - " \"3\" = \"#ea7354\",\n", - " \"4\" = \"#cc3f32\",\n", - " \"5\" = \"#972620\"\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "95892df7-e5b8-4d7a-bf96-88673e633370", - "metadata": {}, - "source": [ - "### Population Totales" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0a196b8-2db5-478d-899a-48985d1735f0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", - "names(NER_palette_population) <- labels_tot\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION,\n", - " breaks = c(0, value_breaks_tot, Inf),\n", - " labels = labels_tot, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_tot, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, 
\"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", - " width = 14,\n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552", - "metadata": {}, - "source": [ - "### Population Femmes Enceintes (FE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9324a56b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_fe\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", - " breaks = c(0, value_breaks_fe, Inf),\n", - " labels = labels_fe, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_fe, \n", - " drop = FALSE # Prevents dropping empty levels from legend\n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = 
element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d", - "metadata": {}, - "source": [ - "### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4046761f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_u5\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", - " breaks = c(0, value_breaks_u5, Inf),\n", - " labels = labels_u5, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_u5, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = 
element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5", - "metadata": {}, - "source": [ - "## 3.2. Complétude et qualité des données de la Population" - ] - }, - { - "cell_type": "markdown", - "id": "0d86ed4a-e194-496b-9440-ad206157ee17", - "metadata": {}, - "source": [ - "#### Population Totale" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# hist(population_data$POPULATION)\n", - "hist(population_data_filtered$POPULATION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION,\n", - " y = fct_reorder(ADM2_NAME, POPULATION),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = 
\"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", \"hist_population_totale.png\"),\n", - " units = \"cm\",\n", - " width = 15,\n", - " height = 23,\n", - " bg = \"white\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5", - "metadata": {}, - "source": [ - "#### Population Femmes Enceintes (FE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " hist(population_data_filtered$POPULATION_FE)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4200afa2-e2f0-4876-9842-141b96f32fe8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " \n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_FE,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = 
element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "} " - ] - }, - { - "cell_type": "markdown", - "id": "e39305c0-3700-48c3-967a-b9c6af3e737f", - "metadata": {}, - "source": [ - "#### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbda9b88-9b91-4845-83a8-795a12124999", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - " hist(population_data_filtered$POPULATION_U5)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - "\n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_U5,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - 
"\n", - "}" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r new file mode 100644 index 0000000..eb34c7e --- /dev/null +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r @@ -0,0 +1,24 @@ +# Shared helpers for snt_dhis2_formatting reporting notebook. + +detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = "mad_flag") { + data_long %>% + dplyr::group_by(OU, indicator, YEAR) %>% + dplyr::mutate( + median_val = median(value, na.rm = TRUE), + mad_val = mad(value, na.rm = TRUE), + "{outlier_column}" := value > (median_val + deviation * mad_val) | value < (median_val - deviation * mad_val) + ) %>% + dplyr::ungroup() +} + +create_dynamic_labels <- function(breaks) { + fmt <- function(x) { + format(x / 1000, big.mark = "'", scientific = FALSE, trim = TRUE) + } + + c( + paste0("< ", fmt(breaks[1]), "k"), + paste0(fmt(breaks[-length(breaks)]), " - ", fmt(breaks[-1]), "k"), + paste0("> ", fmt(breaks[length(breaks)]), "k") + ) +} From 489d4b66d4c19bd78a646af5f268f3140edd7e84 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 30 Mar 2026 14:56:06 +0200 Subject: [PATCH 16/23] milestone, fixing giulia nb --- .../code/NER_pyramid_format.ipynb | 39 +- .../reporting/snt_dhis2_extract_report.ipynb | 284 +- .../utils/snt_dhis2_extract.r | 48 + .../code/snt_dhis2_formatting_routine.ipynb | 1458 +++-- .../snt_dhis2_formatting_report.ipynb | 360 +- .../utils/snt_dhis2_formatting.r | 54 + .../utils/snt_dhis2_formatting_report.r | 4 + 
.../code/snt_dhis2_incidence.ipynb | 61 +- .../utils/snt_dhis2_incidence.r | 44 + ...is2_population_transformation_report.ipynb | 1235 +++-- ...t_dhis2_population_transformation_report.r | 18 + .../code/snt_dhis2_reporting_rate.ipynb | 4783 +++++++++-------- .../snt_dhis2_reporting_rate_report.ipynb | 2105 ++++---- .../utils/snt_dhis2_reporting_rate.r | 79 + 14 files changed, 5434 insertions(+), 5138 deletions(-) create mode 100644 pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r create mode 100644 pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r create mode 100644 pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r create mode 100644 pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r diff --git a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb index 10aa7c9..be611d1 100644 --- a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb +++ b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb @@ -276,43 +276,8 @@ }, "outputs": [], "source": [ - "# inside_matrix: rows = points, columns = polygons\n", - "inside_matrix <- st_within(points_sf, polygons_sf, sparse = FALSE)\n", - "\n", - "# Check if points fall in a polygon\n", - "point_polygon_dict <- list()\n", - "for (i in seq_len(nrow(points_sf))) {\n", - " point_id <- points_sf$id[[i]]\n", - " point_name <- points_sf$name[[i]]\n", - " \n", - " # Which polygons contain this point\n", - " polygons_containing <- which(inside_matrix[i, ])\n", - " \n", - " if (length(polygons_containing) > 0) {\n", - " found_polygons <- polygons_sf[polygons_containing, ] \n", - " found_polygons_ds <- found_polygons[grepl(\"^DS\", found_polygons$name), ]\n", - " \n", - " if (nrow(found_polygons_ds) >= 1) { \n", - " polygon_id <- found_polygons_ds$id[1] # select the first match\n", - " polygon_name <- found_polygons_ds$name[1] \n", - " \n", - " # store in list\n", - " 
point_polygon_dict[[point_id]] <- list(\n", - " point_name = point_name,\n", - " polygon_id = polygon_id,\n", - " polygon_name = polygon_name\n", - " ) \n", - " print(glue(\"Point: {point_name} ({point_id}) is inside polygon: {polygon_name} ({polygon_id})\"))\n", - " \n", - " } else { \n", - " point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA) \n", - " cat(\"Point:\", point_id, \"is not inside any district (DS) polygon\\n\")\n", - " }\n", - " } else {\n", - " point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA)\n", - " cat(\"Point:\", point_id, \"is not inside any district (DS) polygon\\n\")\n", - " }\n", - "}\n" + "# Map each priority facility point to district polygons (DS)\n", + "point_polygon_dict <- map_points_to_ds_polygons(points_sf, polygons_sf)\n" ] }, { diff --git a/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb b/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb index 9afd77b..89cc27c 100644 --- a/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb +++ b/pipelines/snt_dhis2_extract/reporting/snt_dhis2_extract_report.ipynb @@ -2,33 +2,36 @@ "cells": [ { "cell_type": "markdown", + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", "metadata": {}, "source": [ "## **Extraction des données de routine**" - ], - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "43794265-533f-4035-bf3d-975a3409507b", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "## CONFIGURATION ##" - ], - "execution_count": null, - "outputs": [], - "id": "43794265-533f-4035-bf3d-975a3409507b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "2ced7513-0ee6-4b9b-ac07-124e510119af", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", @@ -51,18 +54,18 @@ 
"Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "reticulate::py_config()$python\n", "openhexa <- import(\"openhexa.sdk\")" - ], - "execution_count": null, - "outputs": [], - "id": "2ced7513-0ee6-4b9b-ac07-124e510119af" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load SNT config\n", "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", @@ -79,32 +82,32 @@ "ADM_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL" - ], - "execution_count": null, - "outputs": [], - "id": "9e6b91b3-c196-4a1f-bc3d-a4bec5b90e51" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f8edc2a5-07ce-4507-9939-4322fc510593", "metadata": { "vscode": { "languageId": "r" } }, - "source": [ - "# printdim() loaded from code/snt_utils.r" - ], - "execution_count": null, "outputs": [], - "id": "f8edc2a5-07ce-4507-9939-4322fc510593" + "source": [ + "# printdim() loaded from pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# import analytics DHIS2 data\n", "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", @@ -133,26 +136,26 @@ "printdim(routine_data)\n", "printdim(pyramid_data)\n", "#printdim(reporting_data)" - ], - "execution_count": null, - "outputs": [], - "id": "25362e00-96b5-4200-be45-cdeeff9ce3ac" + ] }, { "cell_type": "markdown", + "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b", "metadata": {}, "source": [ "### 1. 
Liste des éléments de donnée extraits" - ], - "id": "c3cee574-8d66-4cd5-8fe6-97f39daa158b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# 1. Extract the list of categories and their DX codes\n", "category_elements <- map(indicator_defs, ~ .x) # safely preserve all vectors\n", @@ -178,18 +181,18 @@ " caption = \"Liste des éléments de données extraits, classés par indicateur\",\n", " col.names = c(\"ID de l'élément\", \"Nom de l'élément de donnée\", \"Indicateur\")\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "5ef732f5-52a8-4abc-87ba-7ca77f6c85f2" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "fd12633d-8aa2-4ac9-91c4-e1c651031275", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Simple table of data elements and their disaggregations\n", "disaggregations_table <- routine_data %>%\n", @@ -204,26 +207,26 @@ "\n", "# Display\n", "disaggregations_table" - ], - "execution_count": null, - "outputs": [], - "id": "fd12633d-8aa2-4ac9-91c4-e1c651031275" + ] }, { "cell_type": "markdown", + "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d", "metadata": {}, "source": [ "### 2. 
Période de couverture des données" - ], - "id": "35bcc286-cde1-47bd-99ab-3a6f6b39ac5d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "ae59b2f0-0d97-4e04-a7ea-aa136b03cc68", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Mois minimum et maximum dans le jeu de données\n", "cat(\"Premier mois pour lequel les données ont été extraites :\", min(routine_data$PE), \"\\n\")\n", @@ -235,26 +238,26 @@ " ymd(paste0(max(routine_data$PE), \"01\")),\n", " by = \"1 month\") %>%\n", " format(\"%Y%m\")" - ], - "execution_count": null, - "outputs": [], - "id": "ae59b2f0-0d97-4e04-a7ea-aa136b03cc68" + ] }, { "cell_type": "markdown", + "id": "05f6938d-046b-4742-b8cb-3840a3646fb7", "metadata": {}, "source": [ "### 3. Résumé hierarchique" - ], - "id": "05f6938d-046b-4742-b8cb-3840a3646fb7" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7533a147-6e04-4789-8f3a-e4687fb886b9", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Map NAME -> ID (robust if already *_ID)\n", "adm1_id <- ifelse(str_ends(ADM_1, \"_ID\"), ADM_1, str_replace(ADM_1, \"_NAME$\", \"_ID\"))\n", @@ -294,26 +297,26 @@ " \"\\nNote : ADM_1 est mappé sur `{ADM_1}` → `{adm1_id}`, ADM_2 sur `{ADM_2}` → `{adm2_id}`. \",\n", " \"Le niveau opérationnel des formations sanitaires est L{facility_level}.\\n\"\n", "))" - ], - "execution_count": null, - "outputs": [], - "id": "7533a147-6e04-4789-8f3a-e4687fb886b9" + ] }, { "cell_type": "markdown", + "id": "c413e780-7a1a-4241-a06b-274e77d41b50", "metadata": {}, "source": [ "### 4. 
Nombre et activité des formations sanitaires" - ], - "id": "c413e780-7a1a-4241-a06b-274e77d41b50" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "239d062c-143d-4a7c-ac93-bee280e1d57a", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Nombre total de formations sanitaires uniques selon le niveau organisationnel défini dans la pyramide\n", "total_facilities <- pyramid_data %>% \n", @@ -326,18 +329,18 @@ " \"c’est-à-dire le niveau {facility_level} de la hiérarchie sanitaire. \",\n", " \"Au total, {total_facilities} formations sanitaires uniques ont été identifiées à ce niveau.\"\n", "))" - ], - "execution_count": null, - "outputs": [], - "id": "239d062c-143d-4a7c-ac93-bee280e1d57a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4bc914d8-852d-4615-bcba-64bb2d33c56c", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Vérification de l’activité : une formation sanitaire est considérée comme « active »\n", "# si elle a rapporté au moins une valeur (y compris zéro) pendant la période spécifiée.\n", @@ -365,18 +368,18 @@ " \"dans les données de routine ({period_start}–{period_end}), \",\n", " \"soit {round(proportion_active, 1)} % d’établissements ayant effectivement transmis des données.\"\n", "))" - ], - "execution_count": null, - "outputs": [], - "id": "4bc914d8-852d-4615-bcba-64bb2d33c56c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "03179c36-7870-4a37-a075-44420d01a9c4", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Years from routine (already fine)\n", "yrs_rout <- sort(unique(as.integer(substr(routine_data$PE, 1, 4))))\n", @@ -415,18 +418,18 @@ "\n", "kable(reconciliation,\n", " caption = \"Ouverture (pyramide) vs. 
rapportage effectif (routine), par année\")\n" - ], - "execution_count": null, - "outputs": [], - "id": "03179c36-7870-4a37-a075-44420d01a9c4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3c413a37-7edd-48b1-981c-c7e30661dee7", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# --- Make sure VALUE is treated as numeric where possible (silently)\n", "routine_data <- routine_data %>%\n", @@ -479,18 +482,18 @@ " mutate(denom_line = first(denom_year)) %>%\n", " ungroup() %>%\n", " mutate(PE = factor(PE, levels = sort(unique(PE)))) # keep month order" - ], - "execution_count": null, - "outputs": [], - "id": "3c413a37-7edd-48b1-981c-c7e30661dee7" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0f724464-b943-44a0-83e4-b500cf53d9db", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "monthly_reporting_by_year <- monthly_reporting_by_year %>%\n", " dplyr::group_by(year) %>%\n", @@ -529,18 +532,18 @@ " ) +\n", " theme_minimal(base_size = 13) +\n", " theme(axis.text.x = element_text(angle = 45, hjust = 1))\n" - ], - "execution_count": null, - "outputs": [], - "id": "0f724464-b943-44a0-83e4-b500cf53d9db" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "343b759e-ae3d-4201-a602-a3869acc4489", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "options(repr.plot.width = 13, repr.plot.height = 8)\n", "ggplot(monthly_reporting_by_year, aes(x = PE, y = pct_reporting)) +\n", @@ -558,18 +561,18 @@ " axis.text.x = element_text(angle = 45, hjust = 1),\n", " panel.grid.minor = element_blank()\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "343b759e-ae3d-4201-a602-a3869acc4489" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c0324990-c5c5-461a-ba79-6695717e0bce", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (config_json$SNT_CONFIG$COUNTRY_CODE == \"NER\") {\n", "\n", @@ 
-590,13 +593,11 @@ "} else {\n", " cat(\"Cette section n'est pas applicable : la structure pyramidale diffère pour ce pays.\")\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "c0324990-c5c5-461a-ba79-6695717e0bce" + ] }, { "cell_type": "markdown", + "id": "1be4d840-74a8-4a61-bf1a-af556ef270bc", "metadata": {}, "source": [ "### 5. Complétude de l'extraction des données de routine au niveau des formations sanitaires\n", @@ -611,30 +612,32 @@ "Le nombre total de formations sanitaires reste constant, correspondant à celles ayant transmis au moins une donnée sur la période d’analyse.\n", "\n", "Les graphiques ci-dessous illustrent, pour chaque indicateur SNIS, la proportion relative de ces trois types de valeurs au fil du temps, permettant d’évaluer la complétude et la cohérence des données extraites avant tout traitement analytique." - ], - "id": "1be4d840-74a8-4a61-bf1a-af556ef270bc" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3dec0947-f19a-40fe-8927-1ea0efdca904", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "options(jupyter.plot_mimetypes = c(\"image/png\"))" - ], - "execution_count": null, - "outputs": [], - "id": "3dec0947-f19a-40fe-8927-1ea0efdca904" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "b50e66f0-203c-44c4-9511-2532ea145980", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# --- 🚨 (NEW) STEP 1: *GP* sum up VALUEs of each INDICATOR (DX_NAME) by CO!! 
🚨\n", "routine_data <- routine_data %>%\n", @@ -642,18 +645,18 @@ " summarise(VALUE = sum(as.numeric(VALUE)),\n", " .groups = \"drop\") |>\n", "mutate(INDICATOR = DX_NAME)" - ], - "execution_count": null, - "outputs": [], - "id": "b50e66f0-203c-44c4-9511-2532ea145980" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "61d23bf9-14d1-4248-8232-bba1b95d837a", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# --- STEP 2: Build expected full grid (OU × INDICATOR × DATE)\n", "full_grid <- expand_grid(\n", @@ -661,18 +664,18 @@ " INDICATOR = unique(routine_data$INDICATOR),\n", " PE = unique(routine_data$PE)\n", ")" - ], - "execution_count": null, - "outputs": [], - "id": "61d23bf9-14d1-4248-8232-bba1b95d837a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "fa0504b6-baf3-41d0-85b9-a8ffe6a88c11", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# --- STEP 3: Join to detect missing / zero / positive\n", "reporting_check <- full_grid %>%\n", @@ -687,18 +690,18 @@ " is_zero = VALUE == 0 & !is.na(VALUE),\n", " is_positive = VALUE > 0 & !is.na(VALUE)\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "fa0504b6-baf3-41d0-85b9-a8ffe6a88c11" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "12bf998f-79da-4aa7-967e-ee50acaab000", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# --- STEP 4: Summarise by INDICATOR and date\n", "reporting_summary <- reporting_check %>%\n", @@ -715,18 +718,18 @@ " # pct_total = sum(pct_missing, pct_zero, pct_positive), # sanity check: should be always == 100\n", " .groups = \"drop\"\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "12bf998f-79da-4aa7-967e-ee50acaab000" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0504ad04-910e-4ec7-9448-397228baf541", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# --- STEP 5: Reshape for stacked 
plot\n", "plot_data <- reporting_summary %>%\n", @@ -742,35 +745,35 @@ " ) %>%\n", " # complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n", " complete(INDICATOR, PE, Status, fill = list(Percentage = 0))" - ], - "execution_count": null, - "outputs": [], - "id": "0504ad04-910e-4ec7-9448-397228baf541" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "fcd8e259-80ff-4615-a568-9fe63ad0a9f3", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "plot_data <- plot_data %>%\n", " left_join(classified_elements %>% distinct(DX_NAME, Categorie),\n", " by = c(\"INDICATOR\" = \"DX_NAME\"),\n", " relationship = \"many-to-many\")" - ], - "execution_count": null, - "outputs": [], - "id": "fcd8e259-80ff-4615-a568-9fe63ad0a9f3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "8af82e0d-17cf-403e-a2f3-d76a90e915ea", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "categories <- plot_data %>%\n", " filter(!is.na(Categorie)) %>%\n", @@ -806,37 +809,35 @@ " axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)\n", " )\n", "})\n" - ], - "execution_count": null, - "outputs": [], - "id": "8af82e0d-17cf-403e-a2f3-d76a90e915ea" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c4b1db68-6ffe-4e6e-be81-30e615863fbf", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Example: show the first category plot\n", "options(repr.plot.width = 15, repr.plot.height = 5)\n", "walk(plots_by_category, print)" - ], - "execution_count": null, - "outputs": [], - "id": "c4b1db68-6ffe-4e6e-be81-30e615863fbf" + ] }, { "cell_type": "markdown", + "id": "727c614b-c1fd-4c17-8414-1f6478f664c3", "metadata": {}, "source": [ "### 6. 
Disponibilité des données par formation sanitaire (sur la période analysée)" - ], - "id": "727c614b-c1fd-4c17-8414-1f6478f664c3" + ] }, { "cell_type": "markdown", + "id": "dbb43faa-444e-42ec-902f-34a44e3fa355", "metadata": {}, "source": [ "Cette section évalue la disponibilité des données de routine pour chaque formation sanitaire sur l’ensemble de la période analysée.\n", @@ -846,16 +847,18 @@ "- Les couleurs vont du jaune (100 %), indiquant une disponibilité complète, au violet (0 %), indiquant une absence totale de données sur la période.\n", "\n", "Ce diagnostic permet d’identifier les formations sanitaires avec des problèmes chroniques de rapportage ou des interruptions prolongées dans la saisie des données." - ], - "id": "dbb43faa-444e-42ec-902f-34a44e3fa355" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "071be12f-8967-4580-bc2d-43dd07c1f855", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# How many distinct months are in the analysis window?\n", "n_months <- dplyr::n_distinct(routine_data$PE)\n", @@ -914,21 +917,19 @@ " panel.grid = element_blank(),\n", " legend.position = \"right\"\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "071be12f-8967-4580-bc2d-43dd07c1f855" + ] }, { "cell_type": "markdown", + "id": "7a5ed92b-f048-4467-8d3a-f86c6a4fe141", "metadata": {}, "source": [ "### 7. Tendances nationales et mensuelles par élément de données" - ], - "id": "7a5ed92b-f048-4467-8d3a-f86c6a4fe141" + ] }, { "cell_type": "markdown", + "id": "6371c0e1-a5a7-423e-9df3-fc9eb8214bb6", "metadata": {}, "source": [ "Cette section présente l’évolution temporelle des valeurs mensuelles totales pour chaque indicateur de paludisme au cours de la période analysée. 
Les courbes montrent la somme des valeurs rapportées à travers toutes les formations sanitaires et toutes les désagrégations.\n", @@ -936,16 +937,18 @@ "- L’axe horizontal représente le temps (mois), et l’axe vertical le total des valeurs rapportées pour l’ensemble du pays.\n", "\n", "Ces tendances permettent de visualiser les fluctuations saisonnières et d’identifier d’éventuelles anomalies ou ruptures dans la dynamique des cas rapportés." - ], - "id": "6371c0e1-a5a7-423e-9df3-fc9eb8214bb6" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e56014d5-de3f-45a1-8a4f-a05f4ecc78d8", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# routine data\n", "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", @@ -954,34 +957,34 @@ " cat(msg)\n", " stop(msg)\n", " })" - ], - "execution_count": null, - "outputs": [], - "id": "e56014d5-de3f-45a1-8a4f-a05f4ecc78d8" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "582f4c5e-b650-4422-bc94-873fd1f0d399", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Outlier removal removed: keep raw values as reported\n", "routine_data <- routine_data %>%\n", " mutate(VALUE = suppressWarnings(as.numeric(VALUE))) # ensure numeric\n" - ], - "execution_count": null, - "outputs": [], - "id": "582f4c5e-b650-4422-bc94-873fd1f0d399" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1cd7f87b-3ea0-4c32-869d-79f73c9fd016", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "### 1. 
Build mapping table (Categorie, DX, optional CO)\n", "indicator_map <- imap_dfr(indicator_defs, function(dx_list, categorie) {\n", @@ -1126,18 +1129,18 @@ " ) +\n", " guides(color = guide_legend(ncol = 1, byrow = TRUE, override.aes = list(size = 3, alpha = 1)))\n", " })" - ], - "execution_count": null, - "outputs": [], - "id": "1cd7f87b-3ea0-4c32-869d-79f73c9fd016" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4b6b396d-91a3-4839-849b-f720b4ca61b5", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Export each category plot with dynamic canvas to keep legends readable\n", "if (!dir.exists(FIGURES_PATH)) {\n", @@ -1176,10 +1179,7 @@ " log_msg(glue::glue(\"Plot exporte: {file.path(FIGURES_PATH, file_name)}\"))\n", " }\n", ")" - ], - "execution_count": null, - "outputs": [], - "id": "4b6b396d-91a3-4839-849b-f720b4ca61b5" + ] } ], "metadata": { @@ -1199,4 +1199,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r index f72eb97..54304b7 100644 --- a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r +++ b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r @@ -1,5 +1,9 @@ # Shared helpers for snt_dhis2_extract notebooks. 
+printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + make_point_geojson <- function(lat, lon) { sprintf('{"type": "Point", "coordinates": [%f, %f]}', lon, lat) } @@ -71,3 +75,47 @@ norm_fosa_type <- function(x) { TRUE ~ "Autre" ) } + +map_points_to_ds_polygons <- function(points_sf, polygons_sf) { + inside_matrix <- sf::st_within(points_sf, polygons_sf, sparse = FALSE) + point_polygon_dict <- list() + + for (i in seq_len(nrow(points_sf))) { + point_id <- points_sf$id[[i]] + point_name <- points_sf$name[[i]] + polygons_containing <- which(inside_matrix[i, ]) + + if (length(polygons_containing) > 0) { + found_polygons <- polygons_sf[polygons_containing, ] + found_polygons_ds <- found_polygons[grepl("^DS", found_polygons$name), ] + + if (nrow(found_polygons_ds) >= 1) { + polygon_id <- found_polygons_ds$id[1] + polygon_name <- found_polygons_ds$name[1] + + point_polygon_dict[[point_id]] <- list( + point_name = point_name, + polygon_id = polygon_id, + polygon_name = polygon_name + ) + print(glue::glue("Point: {point_name} ({point_id}) is inside polygon: {polygon_name} ({polygon_id})")) + } else { + point_polygon_dict[[point_id]] <- list( + point_name = point_name, + polygon_id = NA, + polygon_name = NA + ) + cat("Point:", point_id, "is not inside any district (DS) polygon\n") + } + } else { + point_polygon_dict[[point_id]] <- list( + point_name = point_name, + polygon_id = NA, + polygon_name = NA + ) + cat("Point:", point_id, "is not inside any district (DS) polygon\n") + } + } + + point_polygon_dict +} diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb index 50f4258..c9c27bb 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb @@ -1,755 +1,713 @@ { - "cells": [ - { - 
"cell_type": "markdown", - "id": "96eee97c-fda4-4827-8111-c438cabed82e", - "metadata": {}, - "source": [ - "## Setup start " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1156ab5-2dc6-4bfb-8d7a-ac594c40ecf8", - "metadata": { - "tags": [ - "parameters" - ], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Parameters\n", - "# This cell is tagged 'parameters' for Papermill to inject SNT_ROOT_PATH\n", - "SNT_ROOT_PATH <- '~/workspace' # SNT root" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "687392e7-fe6c-4355-9f4d-6718b467a33d", - "metadata": { - "tags": [ - "parameters" - ], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set project folders\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "FORMATTED_DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\", \"extracts_formatted\")" - ] - }, - { - "cell_type": "markdown", - "id": "7a31fc88-3fa9-4a65-b61a-dc1564aecc22", - "metadata": {}, - "source": [ - "**Load functions**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "340a8095-ee83-4277-84cd-60c0c626b947", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "markdown", - "id": "1188d74a-0d28-48c7-a72e-3c1933b87cf4", - "metadata": {}, - "source": [ - "**Check and load required libraries** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "715c8d64-aaa0-43e6-b398-59abfb564bae", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "96eee97c-fda4-4827-8111-c438cabed82e", + "metadata": {}, + "source": [ + "## Setup start " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1156ab5-2dc6-4bfb-8d7a-ac594c40ecf8", + "metadata": { + "tags": [ + "parameters" 
+ ], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Parameters\n", + "# This cell is tagged 'parameters' for Papermill to inject SNT_ROOT_PATH\n", + "SNT_ROOT_PATH <- '~/workspace' # SNT root" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "687392e7-fe6c-4355-9f4d-6718b467a33d", + "metadata": { + "tags": [ + "parameters" + ], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set project folders\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_formatting\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "FORMATTED_DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\", \"extracts_formatted\")" + ] + }, + { + "cell_type": "markdown", + "id": "7a31fc88-3fa9-4a65-b61a-dc1564aecc22", + "metadata": {}, + "source": [ + "**Load functions**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "340a8095-ee83-4277-84cd-60c0c626b947", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_formatting.r\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1188d74a-0d28-48c7-a72e-3c1933b87cf4", + "metadata": {}, + "source": [ + "**Check and load required libraries** " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "715c8d64-aaa0-43e6-b398-59abfb564bae", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List required pcks \n", + "required_packages <- c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1cf7e13-3be0-44fa-99ea-3e3520da1229", + "metadata": { + 
"vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "21c802bd-506b-4e60-bd06-b715a5c197ee", + "metadata": {}, + "source": [ + "### Load SNT configuration\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10580e48-ccb5-49df-933d-3cdbc480a402", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "f9d3da29-c02b-44ca-b6ee-a00b990c480a", + "metadata": {}, + "source": [ + "**Checks for SNT mandatory configuration fields**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "590aa36d-74e0-4616-b60e-628e26201c52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# CHECK SNT configuration \n", + "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") #, \"ORG_UNITS_LEVELS_SELECTION\")\n", + "for (conf in snt_config_mandatory) {\n", + " print(paste(conf, \":\", config_json$SNT_CONFIG[conf]))\n", + " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", + " msg <- paste(\"Missing configuration input:\", conf)\n", + " cat(msg) \n", + " stop(msg)\n", + " }\n", + "}\n", + "\n", + "# Save this country code in a variable\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", 
+ "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "markdown", + "id": "0b8083d8-ba7f-49d9-865b-ada38c51b6b2", + "metadata": {}, + "source": [ + "### Load DHIS2 analytics data" + ] + }, + { + "cell_type": "markdown", + "id": "77af1690-79c4-4ad4-92c3-47a5dd119906", + "metadata": {}, + "source": [ + "-Load DHIS2 anlytics from latest dataset version \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82d0d5e9-2cc5-4101-9ea6-59aafdcf5b81", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2 Dataset extract identifier\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", + "\n", + "# Load file from dataset\n", + "dhis2_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 analytics data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_data), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "0d351974-c1c5-4971-97c8-f0122ca9e803", + "metadata": {}, + "source": [ + "## SNT Indicators computation" + ] + }, + { + "cell_type": "markdown", + "id": "3c8c5b6f-395b-43bf-a349-a6f059b4fe5a", + "metadata": {}, + "source": [ + "### Select dhis2 metadata " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "328ce7f1-a010-44c4-92e2-3b31829b32d8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# log\n", + "msg <- paste0(\"Computing SNT indicators.\")\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "a076591f-b6df-49cb-8874-627b61a03356", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Select only metadata (reduce the size of the dataframe)\n", + "administrative_cols <- colnames(dhis2_data)[grepl(\"LEVEL_\", colnames(dhis2_data))]\n", + "dhis2_metadata <- dhis2_data[ , c(\"OU\", administrative_cols)] # Metadata\n", + "dhis2_metadata <- distinct(dhis2_metadata)\n", + "dim(dhis2_metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7dbc619-d440-49f0-983b-18e9f264edaa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Max admin columns available (matchin ou)\n", + "name_cols <- grep(\"LEVEL_\\\\d+_NAME\", administrative_cols, value = TRUE)\n", + "max_level <- max(as.numeric(gsub(\"LEVEL_(\\\\d+)_NAME\", \"\\\\1\", name_cols)))\n", + "max_admin_col_name <- paste0(\"LEVEL_\", max_level, \"_NAME\")\n", + "\n", + "# Result\n", + "print(max_admin_col_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6a94a3a-0078-4a25-af01-bbe49d55d4b4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Clean strings for admin 1 and admin 2\n", + "dhis2_metadata[[ADMIN_1]] <- format_names(dhis2_metadata[[ADMIN_1]]) # (format_names() in snt_utils.r)\n", + "dhis2_metadata[[ADMIN_2]] <- format_names(dhis2_metadata[[ADMIN_2]])" + ] + }, + { + "cell_type": "markdown", + "id": "4cabdecd-e4ba-43c4-9877-302e9854d07d", + "metadata": {}, + "source": [ + "### Select dhis2 values data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4228556-f14e-4fbd-83b6-f99659a01b10", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# dhis2 Values table\n", + "dhis2_values <- dhis2_data[ , c(\"DX\", \"CO\", \"OU\", \"PE\", \"VALUE\")]\n", + "head(dhis2_values)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "e3cca485-87a8-4914-b389-e265235461cd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "unique(dhis2_values$DX)" + ] + }, + { + "cell_type": "markdown", + "id": "63254824-df25-4e48-a0ff-05538e7cd3f8", + "metadata": {}, + "source": [ + "### Pivot dhis2 value table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6ceabbd-5c08-406c-b8e9-bd5615008959", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# make sure we have numeric data in \"values\" column\n", + "dhis2_values$VALUE <- as.numeric(dhis2_values$VALUE)\n", + "\n", + "# pivot table on DX and CO columns (available combinations to columns)\n", + "routine_data <- pivot_wider(dhis2_values,\n", + " id_cols = all_of(c(\"OU\", \"PE\")),\n", + " names_from = c(\"DX\", \"CO\"),\n", + " values_from = 'VALUE')\n", + "\n", + "print(paste(\"Routine data pivot : \", paste0(dim(routine_data), collapse=\", \")))" + ] + }, + { + "cell_type": "markdown", + "id": "cdeeb35d-975f-4929-8d69-a53dfae9d7fa", + "metadata": {}, + "source": [ + "### Build indicator definitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9ffed9b-df31-4f86-bfec-4b1f27150c7d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# copy\n", + "routine_data_ind <- routine_data\n", + "\n", + "# Get list of indicator definitions from SNT configuration\n", + "dhis_indicator_definitions <- config_json$DHIS2_DATA$DHIS2_INDICATOR_DEFINITIONS\n", + "names(dhis_indicator_definitions) <- toupper(names(dhis_indicator_definitions))" + ] + }, + { + "cell_type": "markdown", + "id": "401e0b96-3b0b-4a5c-bccc-e31e251c9d78", + "metadata": {}, + "source": [ + "**Remove empty indicators from the list**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c6584bf-9f8c-47d9-86a3-7dabbe7f2839", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": 
[], + "source": [ + "dhis_indicator_definitions_clean <- dhis_indicator_definitions\n", + "empty_indicators <- c()\n", + "\n", + "# Loop over the indicators and clean the list\n", + "for (name in names(dhis_indicator_definitions_clean)) {\n", + " value <- dhis_indicator_definitions_clean[[name]]\n", + " \n", + " # If value is NULL or length zero, leave as is or set to NULL\n", + " if (is.null(value) || length(value) == 0 || all(value == \"\")) {\n", + " dhis_indicator_definitions_clean[[name]] <- NULL \n", + " empty_indicators <- c(empty_indicators, name)\n", + " next\n", + " } \n", + " # Trim whitespace and then check if empty string\n", + " value_trimmed <- trimws(value)\n", + " dhis_indicator_definitions_clean[[name]] <- value_trimmed \n", + "}\n", + "\n", + "print(\"Complete indicator definitions:\")\n", + "print(names(dhis_indicator_definitions_clean))\n", + "print(\"Empty indicator definitions: \")\n", + "print(empty_indicators)" + ] + }, + { + "cell_type": "markdown", + "id": "ad3f0f9c-b67b-4f0e-8c94-356ae0949165", + "metadata": {}, + "source": [ + "**Start indicators loop**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ccd7f24-92d7-4ffa-b502-12fd0a4678b0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Build configured routine indicators from DHIS2 data definitions\n", + "build_result <- build_routine_indicators(\n", + " routine_data_ind = routine_data_ind,\n", + " dhis_indicator_definitions_clean = dhis_indicator_definitions_clean\n", + ")\n", + "routine_data_ind <- build_result$routine_data_ind\n", + "empty_data_indicators <- build_result$empty_data_indicators" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51c72216-4531-4bce-8d24-15f76b5488f3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add the empty indicator columns (if not needed this can be commented)\n", + "for (empty_indicator in empty_indicators) 
{\n", + " routine_data_ind[empty_indicator] <- NA\n", + " \n", + " # logs\n", + " msg <- paste0(\"Building indicator : \", empty_indicator, \" -> column selection : NULL\")\n", + " log_msg(msg)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cee9752-b5e7-41e3-984b-2008206ffd62", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "print(dim(routine_data_ind))\n", + "head(routine_data_ind, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "e7b19575-978e-4507-b486-58b1be866912", + "metadata": {}, + "source": [ + "## Format SNT routine data" + ] + }, + { + "cell_type": "markdown", + "id": "c80c3ef3-c7a6-4845-9134-3900bfba5eef", + "metadata": {}, + "source": [ + "### SNT format " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7073fec-5ac9-4553-894b-bb04e4e351b0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Filter routine data columns by indicators\n", + "built_indicators <- names(dhis_indicator_definitions)[!(names(dhis_indicator_definitions) %in% empty_data_indicators)]\n", + "routine_data_selection <- routine_data_ind[, c(\"OU\", \"PE\", built_indicators)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dbebbdd-6352-45cd-b64d-3d7253c5dbe0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# left join with metadata\n", + "routine_data_merged <- merge(routine_data_selection, dhis2_metadata, by = \"OU\", all.x = TRUE)\n", + "\n", + "# Select administrative columns\n", + "adm_1_id_col <- gsub(\"_NAME\", \"_ID\", ADMIN_1)\n", + "adm_1_name_col <- ADMIN_1\n", + "adm_2_id_col <- gsub(\"_NAME\", \"_ID\", ADMIN_2)\n", + "adm_2_name_col <- ADMIN_2\n", + "\n", + "# Select and Rename\n", + "routine_data_formatted <- routine_data_merged %>%\n", + " mutate( \n", + " YEAR = as.numeric(substr(PE, 1, 4)),\n", + " MONTH = as.numeric(substr(PE, 5, 6)),\n", + " PE = 
as.numeric(PE)\n", + " ) %>%\n", + " select(\n", + " PERIOD = PE,\n", + " YEAR,\n", + " MONTH,\n", + " OU_ID = OU,\n", + " OU_NAME = !!sym(max_admin_col_name),\n", + " ADM1_NAME = !!sym(adm_1_name_col),\n", + " ADM1_ID = !!sym(adm_1_id_col),\n", + " ADM2_NAME = !!sym(adm_2_name_col),\n", + " ADM2_ID = !!sym(adm_2_id_col),\n", + " all_of(built_indicators)\n", + " )\n", + "\n", + "# Column names to upper case\n", + "colnames(routine_data_formatted) <- clean_column_names(routine_data_formatted)\n", + "\n", + "# Sort dataframe by period\n", + "routine_data_formatted <- routine_data_formatted[order(as.numeric(routine_data_formatted$PERIOD)), ]\n", + "print(dim(routine_data_formatted))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cd73417-e12c-4ac3-bf74-b4773aeb61e6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add the empty data indicator columns (if not needed this can be commented)\n", + "for (empty_data_indicator in empty_data_indicators) {\n", + " routine_data_formatted[empty_data_indicator] <- NA \n", + " # logs\n", + " print(paste0(\"Set indicator \", empty_data_indicator, \" to : NULL\"))\n", + " # log_msg(msg)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36fc1ee-c2e0-4cda-ad6e-a14bfb54896d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(routine_data_formatted,3)" + ] + }, + { + "cell_type": "markdown", + "id": "853b9f8c-01a0-41de-9b6b-3a506f1a36e4", + "metadata": {}, + "source": [ + "### Output formatted routine data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d704604-8af9-48bf-afc4-3965a634ac75", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "out_msg <- paste0(\"Rountine data saved under: \", file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, \"_routine.parquet\")))\n", + "\n", + "# write parquet file\n", + 
"write_parquet(routine_data_formatted, file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, \"_routine.parquet\")))\n", + "\n", + "# write csv file\n", + "write.csv(routine_data_formatted, file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, \"_routine.csv\")), row.names = FALSE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6555bb86-4353-45ee-9464-daecd8833c31", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# log\n", + "log_msg(out_msg)" + ] + }, + { + "cell_type": "markdown", + "id": "07ce08f0-526e-48e9-a72d-16aafc1f40b8", + "metadata": {}, + "source": [ + "### Data Summary " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbaa160c-04e4-4b9c-95e4-b320a358ce40", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Data summary\n", + "print(summary(routine_data_formatted))" + ] } - }, - "outputs": [], - "source": [ - "# List required pcks \n", - "required_packages <- c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1cf7e13-3be0-44fa-99ea-3e3520da1229", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "21c802bd-506b-4e60-bd06-b715a5c197ee", - "metadata": {}, - "source": [ - "### Load SNT configuration\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10580e48-ccb5-49df-933d-3cdbc480a402", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - 
"config_json <- tryCatch({jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "f9d3da29-c02b-44ca-b6ee-a00b990c480a", - "metadata": {}, - "source": [ - "**Checks for SNT mandatory configuration fields**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "590aa36d-74e0-4616-b60e-628e26201c52", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# CHECK SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") #, \"ORG_UNITS_LEVELS_SELECTION\")\n", - "for (conf in snt_config_mandatory) {\n", - " print(paste(conf, \":\", config_json$SNT_CONFIG[conf]))\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " cat(msg) \n", - " stop(msg)\n", - " }\n", - "}\n", - "\n", - "# Save this country code in a variable\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "markdown", - "id": "0b8083d8-ba7f-49d9-865b-ada38c51b6b2", - "metadata": {}, - "source": [ - "### Load DHIS2 analytics data" - ] - }, - { - "cell_type": "markdown", - "id": "77af1690-79c4-4ad4-92c3-47a5dd119906", - "metadata": {}, - "source": [ - "-Load DHIS2 anlytics from latest dataset version \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82d0d5e9-2cc5-4101-9ea6-59aafdcf5b81", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - 
"source": [ - "# DHIS2 Dataset extract identifier\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", - "\n", - "# Load file from dataset\n", - "dhis2_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 analytics data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_data), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "0d351974-c1c5-4971-97c8-f0122ca9e803", - "metadata": {}, - "source": [ - "## SNT Indicators computation" - ] - }, - { - "cell_type": "markdown", - "id": "3c8c5b6f-395b-43bf-a349-a6f059b4fe5a", - "metadata": {}, - "source": [ - "### Select dhis2 metadata " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "328ce7f1-a010-44c4-92e2-3b31829b32d8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# log\n", - "msg <- paste0(\"Computing SNT indicators.\")\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a076591f-b6df-49cb-8874-627b61a03356", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Select only metadata (reduce the size of the dataframe)\n", - "administrative_cols <- colnames(dhis2_data)[grepl(\"LEVEL_\", colnames(dhis2_data))]\n", - "dhis2_metadata <- dhis2_data[ , c(\"OU\", administrative_cols)] # Metadata\n", - "dhis2_metadata <- distinct(dhis2_metadata)\n", - "dim(dhis2_metadata)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7dbc619-d440-49f0-983b-18e9f264edaa", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": 
[ - "# Max admin columns available (matchin ou)\n", - "name_cols <- grep(\"LEVEL_\\\\d+_NAME\", administrative_cols, value = TRUE)\n", - "max_level <- max(as.numeric(gsub(\"LEVEL_(\\\\d+)_NAME\", \"\\\\1\", name_cols)))\n", - "max_admin_col_name <- paste0(\"LEVEL_\", max_level, \"_NAME\")\n", - "\n", - "# Result\n", - "print(max_admin_col_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6a94a3a-0078-4a25-af01-bbe49d55d4b4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Clean strings for admin 1 and admin 2\n", - "dhis2_metadata[[ADMIN_1]] <- format_names(dhis2_metadata[[ADMIN_1]]) # (format_names() in snt_utils.r)\n", - "dhis2_metadata[[ADMIN_2]] <- format_names(dhis2_metadata[[ADMIN_2]])" - ] - }, - { - "cell_type": "markdown", - "id": "4cabdecd-e4ba-43c4-9877-302e9854d07d", - "metadata": {}, - "source": [ - "### Select dhis2 values data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4228556-f14e-4fbd-83b6-f99659a01b10", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# dhis2 Values table\n", - "dhis2_values <- dhis2_data[ , c(\"DX\", \"CO\", \"OU\", \"PE\", \"VALUE\")]\n", - "head(dhis2_values)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e3cca485-87a8-4914-b389-e265235461cd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "unique(dhis2_values$DX)" - ] - }, - { - "cell_type": "markdown", - "id": "63254824-df25-4e48-a0ff-05538e7cd3f8", - "metadata": {}, - "source": [ - "### Pivot dhis2 value table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6ceabbd-5c08-406c-b8e9-bd5615008959", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# make sure we have numeric data in \"values\" column\n", - "dhis2_values$VALUE <- as.numeric(dhis2_values$VALUE)\n", - "\n", - "# pivot table on 
DX and CO columns (available combinations to columns)\n", - "routine_data <- pivot_wider(dhis2_values,\n", - " id_cols = all_of(c(\"OU\", \"PE\")),\n", - " names_from = c(\"DX\", \"CO\"),\n", - " values_from = 'VALUE')\n", - "\n", - "print(paste(\"Routine data pivot : \", paste0(dim(routine_data), collapse=\", \")))" - ] - }, - { - "cell_type": "markdown", - "id": "cdeeb35d-975f-4929-8d69-a53dfae9d7fa", - "metadata": {}, - "source": [ - "### Build indicator definitions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9ffed9b-df31-4f86-bfec-4b1f27150c7d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# copy\n", - "routine_data_ind <- routine_data\n", - "\n", - "# Get list of indicator definitions from SNT configuration\n", - "dhis_indicator_definitions <- config_json$DHIS2_DATA$DHIS2_INDICATOR_DEFINITIONS\n", - "names(dhis_indicator_definitions) <- toupper(names(dhis_indicator_definitions))" - ] - }, - { - "cell_type": "markdown", - "id": "401e0b96-3b0b-4a5c-bccc-e31e251c9d78", - "metadata": {}, - "source": [ - "**Remove empty indicators from the list**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c6584bf-9f8c-47d9-86a3-7dabbe7f2839", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis_indicator_definitions_clean <- dhis_indicator_definitions\n", - "empty_indicators <- c()\n", - "\n", - "# Loop over the indicators and clean the list\n", - "for (name in names(dhis_indicator_definitions_clean)) {\n", - " value <- dhis_indicator_definitions_clean[[name]]\n", - " \n", - " # If value is NULL or length zero, leave as is or set to NULL\n", - " if (is.null(value) || length(value) == 0 || all(value == \"\")) {\n", - " dhis_indicator_definitions_clean[[name]] <- NULL \n", - " empty_indicators <- c(empty_indicators, name)\n", - " next\n", - " } \n", - " # Trim whitespace and then check if empty string\n", - " value_trimmed <- 
trimws(value)\n", - " dhis_indicator_definitions_clean[[name]] <- value_trimmed \n", - "}\n", - "\n", - "print(\"Complete indicator definitions:\")\n", - "print(names(dhis_indicator_definitions_clean))\n", - "print(\"Empty indicator definitions: \")\n", - "print(empty_indicators)" - ] - }, - { - "cell_type": "markdown", - "id": "ad3f0f9c-b67b-4f0e-8c94-356ae0949165", - "metadata": {}, - "source": [ - "**Start indicators loop**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ccd7f24-92d7-4ffa-b502-12fd0a4678b0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# loop over the definitions\n", - "empty_data_indicators <- c()\n", - "for (indicator in names(dhis_indicator_definitions_clean)) {\n", - " \n", - " data_element_uids <- dhis_indicator_definitions_clean[[indicator]] \n", - " col_names <- c()\n", - "\n", - " if (length(data_element_uids) > 0) {\n", - " for (dx in data_element_uids) {\n", - " dx_co <- gsub(\"\\\\.\", \"_\", dx) \n", - " if (grepl(\"_\", dx_co)) {\n", - " col_names <- c(col_names , dx_co)\n", - " } else {\n", - " if (!any(grepl(dx, colnames(routine_data_ind)))) { # is there no dx what match?\n", - " msg <- paste0(\"Data element : \" , dx, \" of indicator \", indicator , \" is missing in the DHIS2 routine data.\")\n", - " log_msg(msg, level=\"warning\")\n", - " } else {\n", - " col_names <- c(col_names , colnames(routine_data_ind)[grepl(dx, colnames(routine_data_ind))])\n", - " } \n", - " }\n", - " }\n", - " \n", - " # check if there are matching data elements\n", - " if (length(col_names) == 0) {\n", - " msg <- paste0(\"No data elements available to build indicator : \" , indicator, \", skipped.\")\n", - " log_msg(msg, level=\"warning\")\n", - " empty_data_indicators <- c(empty_data_indicators, indicator)\n", - " next\n", - " }\n", - " \n", - " # logs\n", - " msg <- paste0(\"Building indicator : \", indicator, \" -> column selection : \", paste(col_names, collapse = \", \")) 
\n", - " log_msg(msg)\n", - " \n", - " if (length(col_names) > 1) {\n", - " sums <- rowSums(routine_data_ind[, col_names], na.rm = TRUE)\n", - " all_na <- rowSums(!is.na(routine_data_ind[, col_names])) == 0\n", - " sums[all_na] <- NA # Keep NA if all rows are NA!\n", - " routine_data_ind[[indicator]] <- sums \n", - " } else {\n", - " routine_data_ind[indicator] <- routine_data_ind[, col_names] \n", - " }\n", - " \n", - " } else {\n", - " routine_data_ind[indicator] <- NA\n", - " \n", - " # logs\n", - " msg <- paste0(\"Building indicator : \", indicator, \" -> column selection : NULL\")\n", - " log_msg(msg)\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51c72216-4531-4bce-8d24-15f76b5488f3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add the empty indicator columns (if not needed this can be commented)\n", - "for (empty_indicator in empty_indicators) {\n", - " routine_data_ind[empty_indicator] <- NA\n", - " \n", - " # logs\n", - " msg <- paste0(\"Building indicator : \", empty_indicator, \" -> column selection : NULL\")\n", - " log_msg(msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cee9752-b5e7-41e3-984b-2008206ffd62", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "print(dim(routine_data_ind))\n", - "head(routine_data_ind, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "e7b19575-978e-4507-b486-58b1be866912", - "metadata": {}, - "source": [ - "## Format SNT routine data" - ] - }, - { - "cell_type": "markdown", - "id": "c80c3ef3-c7a6-4845-9134-3900bfba5eef", - "metadata": {}, - "source": [ - "### SNT format " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7073fec-5ac9-4553-894b-bb04e4e351b0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Filter routine data columns by indicators\n", - "built_indicators <- 
names(dhis_indicator_definitions)[!(names(dhis_indicator_definitions) %in% empty_data_indicators)]\n", - "routine_data_selection <- routine_data_ind[, c(\"OU\", \"PE\", built_indicators)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3dbebbdd-6352-45cd-b64d-3d7253c5dbe0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# left join with metadata\n", - "routine_data_merged <- merge(routine_data_selection, dhis2_metadata, by = \"OU\", all.x = TRUE)\n", - "\n", - "# Select administrative columns\n", - "adm_1_id_col <- gsub(\"_NAME\", \"_ID\", ADMIN_1)\n", - "adm_1_name_col <- ADMIN_1\n", - "adm_2_id_col <- gsub(\"_NAME\", \"_ID\", ADMIN_2)\n", - "adm_2_name_col <- ADMIN_2\n", - "\n", - "# Select and Rename\n", - "routine_data_formatted <- routine_data_merged %>%\n", - " mutate( \n", - " YEAR = as.numeric(substr(PE, 1, 4)),\n", - " MONTH = as.numeric(substr(PE, 5, 6)),\n", - " PE = as.numeric(PE)\n", - " ) %>%\n", - " select(\n", - " PERIOD = PE,\n", - " YEAR,\n", - " MONTH,\n", - " OU_ID = OU,\n", - " OU_NAME = !!sym(max_admin_col_name),\n", - " ADM1_NAME = !!sym(adm_1_name_col),\n", - " ADM1_ID = !!sym(adm_1_id_col),\n", - " ADM2_NAME = !!sym(adm_2_name_col),\n", - " ADM2_ID = !!sym(adm_2_id_col),\n", - " all_of(built_indicators)\n", - " )\n", - "\n", - "# Column names to upper case\n", - "colnames(routine_data_formatted) <- clean_column_names(routine_data_formatted)\n", - "\n", - "# Sort dataframe by period\n", - "routine_data_formatted <- routine_data_formatted[order(as.numeric(routine_data_formatted$PERIOD)), ]\n", - "print(dim(routine_data_formatted))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4cd73417-e12c-4ac3-bf74-b4773aeb61e6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add the empty data indicator columns (if not needed this can be commented)\n", - "for (empty_data_indicator in empty_data_indicators) {\n", - 
" routine_data_formatted[empty_data_indicator] <- NA \n", - " # logs\n", - " print(paste0(\"Set indicator \", empty_data_indicator, \" to : NULL\"))\n", - " # log_msg(msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f36fc1ee-c2e0-4cda-ad6e-a14bfb54896d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(routine_data_formatted,3)" - ] - }, - { - "cell_type": "markdown", - "id": "853b9f8c-01a0-41de-9b6b-3a506f1a36e4", - "metadata": {}, - "source": [ - "### Output formatted routine data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d704604-8af9-48bf-afc4-3965a634ac75", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "out_msg <- paste0(\"Rountine data saved under: \", file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, \"_routine.parquet\")))\n", - "\n", - "# write parquet file\n", - "write_parquet(routine_data_formatted, file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, \"_routine.parquet\")))\n", - "\n", - "# write csv file\n", - "write.csv(routine_data_formatted, file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, \"_routine.csv\")), row.names = FALSE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6555bb86-4353-45ee-9464-daecd8833c31", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# log\n", - "log_msg(out_msg)" - ] - }, - { - "cell_type": "markdown", - "id": "07ce08f0-526e-48e9-a72d-16aafc1f40b8", - "metadata": {}, - "source": [ - "### Data Summary " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cbaa160c-04e4-4b9c-95e4-b320a358ce40", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + 
"pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# Data summary\n", - "print(summary(routine_data_formatted))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index e4d5980..b49d1a7 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -2,11 +2,14 @@ "cells": [ { "cell_type": "code", + "execution_count": null, + "id": "47551f88-b40b-449f-9dc1-59db71183611", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# 💡 Comments / Questions & To Do's:\n", "# - filter by YEAR keep only 2022-2024): \n", @@ -16,18 +19,18 @@ "# - OUTLIERS: there are clear outliers (i.e., DS AGADEZ): shall we do some simple data cleaning here?\n", "# - Population catagories (breaks) do we have a specific scale in mind \n", "# (i.e., use same as another country) or can I set it based on the data" - ], - "execution_count": null, - "outputs": [], - "id": "47551f88-b40b-449f-9dc1-59db71183611" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "342b6b54-4812-4b07-b408-68a034b4014e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# TO DO / FINISH:\n", "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", @@ -35,26 +38,26 @@ "# - Add code to export PNG files of relevant figures\n", "# - Set dynamic boundaries for POPULATION categories? 
(so can use same code in different countries)\n", "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)" - ], - "execution_count": null, - "outputs": [], - "id": "342b6b54-4812-4b07-b408-68a034b4014e" + ] }, { "cell_type": "markdown", + "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5", "metadata": {}, "source": [ "## 0. Paths and Config" - ], - "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7d3285c7-1a60-46ad-9541-36a703d51924", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Set SNT Paths\n", "SNT_ROOT_PATH <- \"~/workspace\"\n", @@ -70,34 +73,34 @@ " dir.create(figures_dir, recursive = TRUE)\n", " print(paste0(\"Created figures directory: \", figures_dir))\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "7d3285c7-1a60-46ad-9541-36a703d51924" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "732733e7-8890-4c3e-be64-496fd4a2c800", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load util functions\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_formatting_report.r\"))" - ], - "execution_count": null, - "outputs": [], - "id": "732733e7-8890-4c3e-be64-496fd4a2c800" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "required_packages <- c(\n", " \"tidyverse\", \n", @@ -109,18 +112,18 @@ "\n", "# Execute function\n", "install_and_load(required_packages)" - ], - "execution_count": null, - "outputs": [], - "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "20475dd9-5091-4f87-9ae2-d0235921fe94", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Set environment to load openhexa.sdk from the right 
environment\n", "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", @@ -130,18 +133,18 @@ "# Load openhexa.sdk\n", "reticulate::py_config()$python\n", "openhexa <- import(\"openhexa.sdk\")" - ], - "execution_count": null, - "outputs": [], - "id": "20475dd9-5091-4f87-9ae2-d0235921fe94" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9f70d726-1c34-47dc-b963-bb23e42994bb", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load SNT config\n", "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", @@ -150,70 +153,70 @@ " cat(msg) \n", " stop(msg) \n", " })" - ], - "execution_count": null, - "outputs": [], - "id": "9f70d726-1c34-47dc-b963-bb23e42994bb" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "90d58c60-fb4e-40e4-add8-5f258f541843", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Configuration variables\n", "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ], - "execution_count": null, - "outputs": [], - "id": "90d58c60-fb4e-40e4-add8-5f258f541843" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf", "metadata": { "vscode": { "languageId": "r" } }, - "source": [], - "execution_count": null, "outputs": [], - "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf" + "source": [] }, { "cell_type": "code", + "execution_count": null, + "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8", "metadata": { "vscode": { "languageId": "r" } }, - "source": [ - "# printdim() loaded from code/snt_utils.r" - ], - "execution_count": null, "outputs": [], - "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8" + "source": [ + "# printdim() loaded from 
pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r" + ] }, { "cell_type": "markdown", + "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69", "metadata": {}, "source": [ "## 1. Import data" - ], - "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# import analytics DHIS2 data\n", "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", @@ -224,18 +227,18 @@ " })\n", "\n", "printdim(routine_data)" - ], - "execution_count": null, - "outputs": [], - "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", " error = function(e) {\n", @@ -245,18 +248,18 @@ " })\n", "\n", "printdim(population_data)" - ], - "execution_count": null, - "outputs": [], - "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", " error = function(e) { \n", @@ -266,74 +269,74 @@ " })\n", "\n", "printdim(shapes_data)" - ], - "execution_count": null, - "outputs": [], - "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60" + ] }, { "cell_type": "markdown", + "id": "c881f748-e391-46c9-a36a-ed11c238a6ce", "metadata": {}, - "source": [], - "id": "c881f748-e391-46c9-a36a-ed11c238a6ce" + "source": [] }, { "cell_type": "code", + "execution_count": null, + "id": 
"65ea60f5-99e9-46d1-89f0-03245d9efd0b", "metadata": { "vscode": { "languageId": "r" } }, - "source": [], - "execution_count": null, "outputs": [], - "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b" + "source": [] }, { "cell_type": "markdown", + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", "metadata": {}, "source": [ "# **Complétude des indicateurs composites**\n" - ], - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" + ] }, { "cell_type": "markdown", + "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de", "metadata": {}, "source": [ "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" - ], - "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c7691e61-6542-4d40-af2a-c018d29b86a8", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "head(routine_data)" - ], - "execution_count": null, - "outputs": [], - "id": "c7691e61-6542-4d40-af2a-c018d29b86a8" + ] }, { "cell_type": "markdown", + "id": "c109e82d-8c72-41f0-857a-322163cf213e", "metadata": {}, "source": [ "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" - ], - "id": "c109e82d-8c72-41f0-857a-322163cf213e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 0: Rename your data for convenience\n", "data <- routine_data\n", @@ -402,18 +405,18 @@ " pct_positive = \"Valeur positive rapportée\")\n", " ) %>%\n", " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n" - ], - "execution_count": null, - "outputs": [], - "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "options(repr.plot.width = 
17, repr.plot.height = 10)\n", "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", @@ -438,26 +441,26 @@ " axis.title = element_text(size = 16),\n", " axis.text = element_text(size = 16)\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af" + ] }, { "cell_type": "markdown", + "id": "e6871759-714b-437a-8b9c-5a5a06656567", "metadata": {}, "source": [ "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." - ], - "id": "e6871759-714b-437a-8b9c-5a5a06656567" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 0: Rename for convenience\n", "data <- routine_data\n", @@ -552,50 +555,50 @@ " axis.title = element_text(size = 14),\n", " axis.text = element_text(size = 12)\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157" + ] }, { "cell_type": "markdown", + "id": "5cda3985", "metadata": {}, "source": [ "# 2. Cohérence interne des indicateurs composites" - ], - "id": "5cda3985" + ] }, { "cell_type": "markdown", + "id": "c131a633", "metadata": {}, "source": [ "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", "\n", "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." 
- ], - "id": "c131a633" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "936268f4", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# detect_mad_outliers() loaded from utils/snt_dhis2_formatting_report.r" - ], - "execution_count": null, - "outputs": [], - "id": "936268f4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "881f9625", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 0: Select relevant core indicators\n", "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", @@ -634,18 +637,18 @@ " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "881f9625" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "04d41ed1", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", "outlier_flags <- mad_combined %>%\n", @@ -670,26 +673,26 @@ "routine_data_clean <- routine_long_clean %>%\n", " select(-OU) %>%\n", " pivot_wider(names_from = indicator, values_from = value)\n" - ], - "execution_count": null, - "outputs": [], - "id": "04d41ed1" + ] }, { "cell_type": "markdown", + "id": "c6a5a77b", "metadata": {}, "source": [ "## 2.2 Cohérence des indicateurs" - ], - "id": "c6a5a77b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "6cfeb18e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Step 1: Extract year and month from PERIOD\n", "routine_hd_month <- routine_data_clean %>%\n", @@ -730,18 +733,18 @@ "\n", "# Step 3: Combine plots\n", "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" - ], - "execution_count": null, - "outputs": [], - "id": "6cfeb18e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0df24272", "metadata": { "vscode": { "languageId": "r" } }, + 
"outputs": [], "source": [ "# Step 1: Aggregate monthly values\n", "rds_clean_month <- routine_data_clean %>%\n", @@ -777,34 +780,34 @@ " legend.title = element_text(size = 16),\n", " legend.text = element_text(size = 16)\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "0df24272" + ] }, { "cell_type": "markdown", + "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48", "metadata": {}, "source": [ "# 3. Carte des populations par district sanitaire (DS)" - ], - "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48" + ] }, { "cell_type": "markdown", + "id": "da58bbd3", "metadata": {}, "source": [ "## 3.1. Carte de la Population pour ADM2 " - ], - "id": "da58bbd3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "6965155d", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Code from previous version of the notebook\n", "# Uses continuos scale for population\n", @@ -831,27 +834,27 @@ "} else {\n", " print(\"Population or shapes data not available.\")\n", "}\n" - ], - "execution_count": null, - "outputs": [], - "id": "6965155d" + ] }, { "cell_type": "markdown", + "id": "eb276692", "metadata": {}, "source": [ "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. 
" - ], - "id": "eb276692" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4d33724e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "population_data_filtered <- population_data\n", "if (COUNTRY_CODE == \"NER\") {\n", @@ -889,18 +892,18 @@ " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", "\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "4d33724e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "NER_palette_population <- c(\n", " \"1\" = \"#fae6db\",\n", @@ -909,26 +912,26 @@ " \"4\" = \"#cc3f32\",\n", " \"5\" = \"#972620\"\n", ")\n" - ], - "execution_count": null, - "outputs": [], - "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30" + ] }, { "cell_type": "markdown", + "id": "95892df7-e5b8-4d7a-bf96-88673e633370", "metadata": {}, "source": [ "### Population Totales" - ], - "id": "95892df7-e5b8-4d7a-bf96-88673e633370" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a0a196b8-2db5-478d-899a-48985d1735f0", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (COUNTRY_CODE == \"NER\") {\n", "\n", @@ -985,26 +988,26 @@ " dpi = 300\n", ")\n", "}\n" - ], - "execution_count": null, - "outputs": [], - "id": "a0a196b8-2db5-478d-899a-48985d1735f0" + ] }, { "cell_type": "markdown", + "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552", "metadata": {}, "source": [ "### Population Femmes Enceintes (FE)" - ], - "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9324a56b", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (COUNTRY_CODE == \"NER\") {\n", "\n", @@ -1061,26 +1064,26 @@ ")\n", "\n", "}\n" - ], - "execution_count": null, - "outputs": [], - "id": "9324a56b" + ] }, { "cell_type": "markdown", + "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d", 
"metadata": {}, "source": [ "### Population Enfants moins de 5 ans (U5)" - ], - "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4046761f", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (COUNTRY_CODE == \"NER\") {\n", "\n", @@ -1137,49 +1140,49 @@ ")\n", "\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "4046761f" + ] }, { "cell_type": "markdown", + "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5", "metadata": {}, "source": [ "## 3.2. Complétude et qualité des données de la Population" - ], - "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5" + ] }, { "cell_type": "markdown", + "id": "0d86ed4a-e194-496b-9440-ad206157ee17", "metadata": {}, "source": [ "#### Population Totale" - ], - "id": "0d86ed4a-e194-496b-9440-ad206157ee17" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# hist(population_data$POPULATION)\n", "hist(population_data_filtered$POPULATION)" - ], - "execution_count": null, - "outputs": [], - "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "ggplot(population_data_filtered) +\n", " geom_point(aes(x = POPULATION,\n", @@ -1212,43 +1215,43 @@ " height = 23,\n", " bg = \"white\"\n", ")" - ], - "execution_count": null, - "outputs": [], - "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68" + ] }, { "cell_type": "markdown", + "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5", "metadata": {}, "source": [ "#### Population Femmes Enceintes (FE)" - ], - "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": 
[], "source": [ "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", " hist(population_data_filtered$POPULATION_FE)\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4200afa2-e2f0-4876-9842-141b96f32fe8", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", " \n", @@ -1278,42 +1281,42 @@ " )\n", "\n", "} " - ], - "execution_count": null, - "outputs": [], - "id": "4200afa2-e2f0-4876-9842-141b96f32fe8" + ] }, { "cell_type": "markdown", + "id": "e39305c0-3700-48c3-967a-b9c6af3e737f", "metadata": {}, "source": [ "#### Population Enfants moins de 5 ans (U5)" - ], - "id": "e39305c0-3700-48c3-967a-b9c6af3e737f" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "bbda9b88-9b91-4845-83a8-795a12124999", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", " hist(population_data_filtered$POPULATION_U5)\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "bbda9b88-9b91-4845-83a8-795a12124999" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", "\n", @@ -1343,10 +1346,7 @@ " )\n", "\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d" + ] } ], "metadata": { @@ -1366,4 +1366,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r new file mode 100644 index 
0000000..26b29e8 --- /dev/null +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r @@ -0,0 +1,54 @@ +# Shared helpers for snt_dhis2_formatting code notebooks. + +build_routine_indicators <- function(routine_data_ind, dhis_indicator_definitions_clean) { + empty_data_indicators <- c() + + for (indicator in names(dhis_indicator_definitions_clean)) { + data_element_uids <- dhis_indicator_definitions_clean[[indicator]] + col_names <- c() + + if (length(data_element_uids) > 0) { + for (dx in data_element_uids) { + dx_co <- gsub("\\.", "_", dx) + if (grepl("_", dx_co)) { + col_names <- c(col_names, dx_co) + } else { + if (!any(grepl(dx, colnames(routine_data_ind)))) { + msg <- paste0("Data element : ", dx, " of indicator ", indicator, " is missing in the DHIS2 routine data.") + log_msg(msg, level = "warning") + } else { + col_names <- c(col_names, colnames(routine_data_ind)[grepl(dx, colnames(routine_data_ind))]) + } + } + } + + if (length(col_names) == 0) { + msg <- paste0("No data elements available to build indicator : ", indicator, ", skipped.") + log_msg(msg, level = "warning") + empty_data_indicators <- c(empty_data_indicators, indicator) + next + } + + msg <- paste0("Building indicator : ", indicator, " -> column selection : ", paste(col_names, collapse = ", ")) + log_msg(msg) + + if (length(col_names) > 1) { + sums <- rowSums(routine_data_ind[, col_names], na.rm = TRUE) + all_na <- rowSums(!is.na(routine_data_ind[, col_names])) == 0 + sums[all_na] <- NA + routine_data_ind[[indicator]] <- sums + } else { + routine_data_ind[indicator] <- routine_data_ind[, col_names] + } + } else { + routine_data_ind[indicator] <- NA + msg <- paste0("Building indicator : ", indicator, " -> column selection : NULL") + log_msg(msg) + } + } + + list( + routine_data_ind = routine_data_ind, + empty_data_indicators = empty_data_indicators + ) +} diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r 
b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r index eb34c7e..ecfabfe 100644 --- a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r @@ -1,5 +1,9 @@ # Shared helpers for snt_dhis2_formatting reporting notebook. +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = "mad_flag") { data_long %>% dplyr::group_by(OU, indicator, YEAR) %>% diff --git a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb index 1036c5f..c0cb6eb 100644 --- a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb +++ b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb @@ -137,6 +137,7 @@ "source": [ "# PROJECT PATHS\n", "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_incidence\")\n", "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", @@ -144,6 +145,7 @@ "\n", "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", "source(file.path(CODE_PATH, \"snt_palettes.r\")) # palettes \n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_incidence.r\"))\n", "\n", "# List required pcks\n", "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", @@ -237,14 +239,7 @@ }, "outputs": [], "source": [ - "# helper function \n", - "resolve_routine_filename <- function(routine_choice) { \n", - " if (routine_choice == \"raw\") return(\"_routine.parquet\")\n", - " 
is_removed <- FALSE\n", - " if (routine_choice == \"raw_without_outliers\") is_removed <- TRUE \n", - " removed_status <- if (is_removed) \"removed\" else \"imputed\" \n", - " return(glue::glue(\"_routine_outliers_{removed_status}.parquet\"))\n", - "} " + "# resolve_routine_filename() loaded from utils/snt_dhis2_incidence.r" ] }, { @@ -298,30 +293,6 @@ "outputs": [], "source": [ "# Warn when incidence routine choice differs from latest reporting-rate routine choice\n", - "infer_reporting_routine_choice <- function(reporting_parameters) {\n", - " if (is.null(reporting_parameters)) {\n", - " return(NULL)\n", - " }\n", - " if (length(names(reporting_parameters)) == 0) {\n", - " return(NULL)\n", - " }\n", - "\n", - " # New reporting-rate pipelines store ROUTINE_FILE in parameters.json\n", - " if (\"ROUTINE_FILE\" %in% names(reporting_parameters)) {\n", - " routine_file <- as.character(reporting_parameters$ROUTINE_FILE[[1]])\n", - " if (grepl(\"_routine_outliers_removed\\\\.parquet$\", routine_file)) return(\"raw_without_outliers\")\n", - " if (grepl(\"_routine_outliers_imputed\\\\.parquet$\", routine_file)) return(\"imputed\")\n", - " if (grepl(\"_routine\\\\.parquet$\", routine_file)) return(\"raw\")\n", - " }\n", - "\n", - " # Legacy reporting-rate pipeline uses formatted routine data (raw)\n", - " if (\"REPORTING_RATE_METHOD\" %in% names(reporting_parameters)) {\n", - " return(\"raw\")\n", - " }\n", - "\n", - " return(NULL)\n", - "}\n", - "\n", "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", "rr_parameters <- tryCatch(\n", " {\n", @@ -1628,27 +1599,7 @@ }, "outputs": [], "source": [ - "# Reusable function to generate filename and save data\n", - "save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function) {\n", - " \n", - " base_name_parts <- c(COUNTRY_CODE, \"_incidence\")\n", - " \n", - " # Concatenate all parts to form the final filename\n", - " file_name <- paste0(c(base_name_parts, 
file_extension), collapse = \"\")\n", - " # file_path <- file.path(data_path, \"incidence\", file_name)\n", - " file_path <- file.path(data_path, file_name)\n", - " output_dir <- dirname(file_path)\n", - "\n", - " # Check if the output directory exists, else create it\n", - " if (!dir.exists(output_dir)) {\n", - " dir.create(output_dir, recursive = TRUE)\n", - " }\n", - "\n", - " # Flexibility to use function as provided in argument: \"write_csv\" or \"arrow::write_parquet\" ... \n", - " write_function(yearly_incidence, file_path)\n", - "\n", - " log_msg(paste0(\"Exporting : \", file_path))\n", - "}" + "# save_yearly_incidence() loaded from utils/snt_dhis2_incidence.r" ] }, { @@ -1693,10 +1644,10 @@ "# Export the data\n", "\n", "# CSV\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv)\n", + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv, COUNTRY_CODE)\n", "\n", "# Parquet\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet)" + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet, COUNTRY_CODE)" ] } ], diff --git a/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r b/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r new file mode 100644 index 0000000..7242035 --- /dev/null +++ b/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r @@ -0,0 +1,44 @@ +# Shared helpers for snt_dhis2_incidence notebooks. 
+ +resolve_routine_filename <- function(routine_choice) { + if (routine_choice == "raw") return("_routine.parquet") + is_removed <- FALSE + if (routine_choice == "raw_without_outliers") is_removed <- TRUE + removed_status <- if (is_removed) "removed" else "imputed" + return(glue::glue("_routine_outliers_{removed_status}.parquet")) +} + +infer_reporting_routine_choice <- function(reporting_parameters) { + if (is.null(reporting_parameters)) return(NULL) + if (length(names(reporting_parameters)) == 0) return(NULL) + + if ("ROUTINE_FILE" %in% names(reporting_parameters)) { + routine_file <- as.character(reporting_parameters$ROUTINE_FILE[[1]]) + if (grepl("_routine_outliers_removed\\.parquet$", routine_file)) return("raw_without_outliers") + if (grepl("_routine_outliers_imputed\\.parquet$", routine_file)) return("imputed") + if (grepl("_routine\\.parquet$", routine_file)) return("raw") + } + + if ("REPORTING_RATE_METHOD" %in% names(reporting_parameters)) return("raw") + return(NULL) +} + +save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export yearly incidence.") + } + + file_name <- paste0(country_code, "_incidence", file_extension) + file_path <- file.path(data_path, file_name) + output_dir <- dirname(file_path) + + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } + + write_function(yearly_incidence, file_path) + log_msg(paste0("Exporting : ", file_path)) +} diff --git a/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb b/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb index e3a08de..5c21a62 100644 --- 
a/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb +++ b/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb @@ -1,634 +1,611 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "0e110b98", - "metadata": {}, - "source": [ - "## 0. Paths and Config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a12ad46", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_population_transformation/reporting\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a57ff983", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "686cf285", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "required_packages <- c(\n", - " \"tidyverse\", \n", - " \"arrow\", \n", - " \"sf\", \n", - " \"reticulate\"\n", - " # \"patchwork\"\n", - ") \n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8aaaa55", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load 
openhexa.sdk\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8365f8e3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "609760f7", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration variables\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bda5a5b5", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - " # --- Read data from SNT_metadata.json ---\n", - " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })" - ] - }, - { - "cell_type": "markdown", - "id": "212ba5c5", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "### Helper function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36100ef8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", 
ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "32e0ac34", - "metadata": {}, - "source": [ - "## 1. Import data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1e62ec8", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "0e110b98", + "metadata": {}, + "source": [ + "## 0. Paths and Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a12ad46", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_population_transformation\")\n", + "\n", + "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_population_transformation/reporting\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a57ff983", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_population_transformation_report.r\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "686cf285", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "required_packages <- c(\n", + " \"tidyverse\", \n", + " \"arrow\", \n", + " \"sf\", \n", + " \"reticulate\"\n", + " # \"patchwork\"\n", + ") \n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8aaaa55", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load 
openhexa.sdk from the right environment\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load openhexa.sdk\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8365f8e3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "609760f7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bda5a5b5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + " # --- Read data from SNT_metadata.json ---\n", + " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })" + ] + }, + { + "cell_type": "markdown", + "id": "212ba5c5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "### Helper function(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36100ef8", + 
"metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# printdim() loaded from utils/snt_dhis2_population_transformation_report.r" + ] + }, + { + "cell_type": "markdown", + "id": "32e0ac34", + "metadata": {}, + "source": [ + "## 1. Import data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1e62ec8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " population_data <- NULL\n", + " })\n", + "\n", + "printdim(population_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d54eec5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "shapes_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(shapes_dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , shapes_dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes_data <- NULL\n", + " })\n", + "\n", + "printdim(shapes_data)" + ] + }, + { + "cell_type": "markdown", + "id": "04ee34ba", + "metadata": {}, + "source": [ + "# 3. 
Carte des populations par district sanitaire (DS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "449bb1e6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Code from previous version of the notebook\n", + "# Uses continuos scale for population\n", + "\n", + "# Run if population_data is available\n", + "if (!is.null(population_data) & !is.null(shapes_data)) {\n", + " # Join population to spatial shapes\n", + " map_data <- shapes_data %>%\n", + " left_join(population_data, by = \"ADM2_ID\")\n", + " \n", + " # Plot population per district (DS)\n", + " plot <- ggplot(map_data) +\n", + " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", + " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Données DHIS2\",\n", + " caption = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " theme_minimal(base_size = 14) \n", + "\n", + " print(plot)\n", + "\n", + "} else {\n", + " print(\"Population or shapes data not available.\")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7b84e627", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "6c596384", + "metadata": {}, + "source": [ + "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", + "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " + ] + }, + { + "cell_type": "markdown", + "id": "9ef7c3b2", + "metadata": {}, + "source": [ + "### 🇳🇪 NER specific code \n", + "Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", + "* only year 2022 to 2024\n", + "* specific palette (yellowish to brick red)\n", + "* specific intervals\n", + "* looks at **disaggregated** population <- this is sometimes contry-specific!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc184bb2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + " print(\"🇳🇪 Executing NER specific code ... \")\n", + "\n", + " # --- Filter data to keep only 2022-2024 ... ---\n", + " years_to_keep <- 2022:2024\n", + " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", + "\n", + " # --- Assign population breaks from metadata ---\n", + " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", + " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", + " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", + "\n", + " # --- Create dynamic labels based on breaks ---\n", + " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", + " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", + " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b7cc86e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "NER_palette_population <- c(\n", + " \"1\" = \"#fae6db\",\n", + " \"2\" = \"#f1b195\",\n", + " \"3\" = \"#ea7354\",\n", + " \"4\" = \"#cc3f32\",\n", + " \"5\" = \"#972620\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "727006f3", + "metadata": {}, + "source": [ + "### Population Totales" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "457a1280", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", + "names(NER_palette_population) <- labels_tot\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION,\n", + " 
breaks = c(0, value_breaks_tot, Inf),\n", + " labels = labels_tot, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\",\n", + " fill = \"Population Totale:\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_tot, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title.position = \"top\",\n", + " legend.title = element_text(face = \"bold\"),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(20, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export to see better in high resolution\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_total.png\")),\n", + " create.dir = TRUE,\n", + " units = \"cm\",\n", + " width = 21,\n", + " height = 15,\n", + " dpi = 300\n", + ")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "70b6e6ca", + "metadata": {}, + "source": [ + "### Population Femmes Enceintes (FE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94fe3a11", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + "names(NER_palette_population) <- 
labels_fe\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_FE,\n", + " breaks = c(0, value_breaks_fe, Inf),\n", + " labels = labels_fe, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\",\n", + " fill = \"Population Femmes Enceintes:\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_fe, \n", + " drop = FALSE # Prevents dropping empty levels from legend\n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_text(face = \"bold\"),\n", + " legend.title.position = \"top\",\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(20, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export to see better in high resolution\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_fe.png\")),\n", + " create.dir = TRUE,\n", + " units = \"cm\",\n", + " width = 21, \n", + " height = 15,\n", + " dpi = 300\n", + ")\n", + "\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "66d761c6", + "metadata": {}, + "source": [ + "### Population Enfants moins de 5 ans (U5)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "c88868f2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + "names(NER_palette_population) <- labels_u5\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_U5,\n", + " breaks = c(0, value_breaks_u5, Inf),\n", + " labels = labels_u5, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\",\n", + " fill = \"Population Enfants de moins de 5 ans:\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_u5, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_text(face = \"bold\"),\n", + " legend.title.position = \"top\",\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(20, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export PNG\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_u5.png\")),\n", + " create.dir = TRUE,\n", + " units = \"cm\",\n", + " width = 21, \n", + " height = 15,\n", + " dpi = 300\n", + ")\n", + "\n", + "}" + ] } - }, - 
"outputs": [], - "source": [ - "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " population_data <- NULL\n", - " })\n", - "\n", - "printdim(population_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d54eec5", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "shapes_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(shapes_dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , shapes_dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes_data <- NULL\n", - " })\n", - "\n", - "printdim(shapes_data)" - ] - }, - { - "cell_type": "markdown", - "id": "04ee34ba", - "metadata": {}, - "source": [ - "# 3. 
Carte des populations par district sanitaire (DS)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "449bb1e6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Code from previous version of the notebook\n", - "# Uses continuos scale for population\n", - "\n", - "# Run if population_data is available\n", - "if (!is.null(population_data) & !is.null(shapes_data)) {\n", - " # Join population to spatial shapes\n", - " map_data <- shapes_data %>%\n", - " left_join(population_data, by = \"ADM2_ID\")\n", - " \n", - " # Plot population per district (DS)\n", - " plot <- ggplot(map_data) +\n", - " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Données DHIS2\",\n", - " caption = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " theme_minimal(base_size = 14) \n", - "\n", - " print(plot)\n", - "\n", - "} else {\n", - " print(\"Population or shapes data not available.\")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7b84e627", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "6c596384", - "metadata": {}, - "source": [ - "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", - "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " - ] - }, - { - "cell_type": "markdown", - "id": "9ef7c3b2", - "metadata": {}, - "source": [ - "### 🇳🇪 NER specific code \n", - "Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", - "* only year 2022 to 2024\n", - "* specific palette (yellowish to brick red)\n", - "* specific intervals\n", - "* looks at **disaggregated** population <- this is sometimes contry-specific!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc184bb2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - " print(\"🇳🇪 Executing NER specific code ... \")\n", - "\n", - " # --- Filter data to keep only 2022-2024 ... ---\n", - " years_to_keep <- 2022:2024\n", - " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", - "\n", - " # # --- Read data from SNT_metadata.json ---\n", - " # metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", - " # error = function(e) {\n", - " # msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", - " # cat(msg) \n", - " # stop(msg) \n", - " # })\n", - "\n", - " # --- Assign population breaks from metadata ---\n", - " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", - " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", - " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", - "\n", - " # --- Define function to create dyanic labels based on breaks for pop category ---\n", - " create_dynamic_labels <- function(breaks) {\n", - " fmt <- function(x) {\n", - " format(x / 1000, big.mark = \"'\", scientific = FALSE, trim = TRUE)\n", - " }\n", - " \n", - " labels <- c(\n", - " paste0(\"< \", fmt(breaks[1]), \"k\"), # First label\n", - " paste0(fmt(breaks[-length(breaks)]), \" - \", fmt(breaks[-1]), \"k\"), # Middle\n", - " paste0(\"> \", fmt(breaks[length(breaks)]), \"k\") # Last label\n", - " ) \n", - " return(labels)\n", - " }\n", - "\n", - " # --- Create dynamic labels based on breaks ---\n", - " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", - " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", - " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"6b7cc86e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "NER_palette_population <- c(\n", - " \"1\" = \"#fae6db\",\n", - " \"2\" = \"#f1b195\",\n", - " \"3\" = \"#ea7354\",\n", - " \"4\" = \"#cc3f32\",\n", - " \"5\" = \"#972620\"\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "727006f3", - "metadata": {}, - "source": [ - "### Population Totales" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "457a1280", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", - "names(NER_palette_population) <- labels_tot\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION,\n", - " breaks = c(0, value_breaks_tot, Inf),\n", - " labels = labels_tot, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\",\n", - " fill = \"Population Totale:\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_tot, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title.position = \"top\",\n", - " legend.title = element_text(face = \"bold\"),\n", - " strip.text = 
element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(20, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_total.png\")),\n", - " create.dir = TRUE,\n", - " units = \"cm\",\n", - " width = 21,\n", - " height = 15,\n", - " dpi = 300\n", - ")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "70b6e6ca", - "metadata": {}, - "source": [ - "### Population Femmes Enceintes (FE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94fe3a11", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_fe\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", - " breaks = c(0, value_breaks_fe, Inf),\n", - " labels = labels_fe, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\",\n", - " fill = \"Population Femmes Enceintes:\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_fe, \n", - " drop = FALSE # Prevents dropping empty levels from legend\n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " 
plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_text(face = \"bold\"),\n", - " legend.title.position = \"top\",\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(20, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_fe.png\")),\n", - " create.dir = TRUE,\n", - " units = \"cm\",\n", - " width = 21, \n", - " height = 15,\n", - " dpi = 300\n", - ")\n", - "\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "66d761c6", - "metadata": {}, - "source": [ - "### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c88868f2", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_u5\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", - " breaks = c(0, value_breaks_u5, Inf),\n", - " labels = labels_u5, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = 
\"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\",\n", - " fill = \"Population Enfants de moins de 5 ans:\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_u5, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_text(face = \"bold\"),\n", - " legend.title.position = \"top\",\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(20, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_u5.png\")),\n", - " create.dir = TRUE,\n", - " units = \"cm\",\n", - " width = 21, \n", - " height = 15,\n", - " dpi = 300\n", - ")\n", - "\n", - "}" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r new file mode 100644 index 0000000..265a7b1 --- /dev/null +++ b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r @@ -0,0 +1,18 @@ +# Shared helpers for snt_dhis2_population_transformation reporting notebook. 
+ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + +create_dynamic_labels <- function(breaks) { + fmt <- function(x) { + format(x / 1000, big.mark = "'", scientific = FALSE, trim = TRUE) + } + + labels <- c( + paste0("< ", fmt(breaks[1]), "k"), + paste0(fmt(breaks[-length(breaks)]), " - ", fmt(breaks[-1]), "k"), + paste0("> ", fmt(breaks[length(breaks)]), "k") + ) + return(labels) +} diff --git a/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb b/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb index 81eded2..4b8cf71 100644 --- a/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb +++ b/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb @@ -1,2355 +1,2438 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f5827740-2917-4504-9017-9ec7d408e5f4", - "metadata": {}, - "source": [ - "Script structure:\n", - "\n", - " 0. Parameters: set back-up values for parameters, for when the notebook is run manually (_noy_ via pipeline)\n", - " 1. Setup:\n", - " * Paths\n", - " * Utils functions\n", - " 2. Load Data\n", - " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", - " * **Reporting** (DHIS2) pre-computed, already formatted & aggregated (output of pipeline ???)\n", - " * **Shapes** (DHIS2) for plotting (this could be removed if we move the plots to \"report/EDA\" nb)\n", - " 3. Calculate **Reportng Rate (RR)**\n", - " * \"**Dataset**\": using pre-computed reportings from DHIS2/SNIS (was: \"DHIS2\")\n", - " * \"**Data Element**\": using calculated expected nr of report (nr of active facilities) (was: \"CONF\")\n", - " 4. 
**Export** reporting rate data to `.../data/dhis2/reporting_rate/` as .parquet (and .csv) files for **either**:\n", - " * data**set**: \"XXX_reporting_rate_**dataset**.parquet\" **or**\n", - " * data**element**: \"XXX_reporting_rate_**dataelement**.parquet\"" - ] - }, - { - "cell_type": "markdown", - "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", - "metadata": {}, - "source": [ - "--------------------" - ] - }, - { - "cell_type": "markdown", - "id": "e962c5a4-6b09-4485-8d71-d842159118d3", - "metadata": {}, - "source": [ - "### To Do:\n", - "* For `DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\"`: **add code** to count OPEN facilities () for **countries with \"normal\" pyramids** (i.e., when no mixing of facilities and admin levels ... !). Atm only code for Niger, which runs only if `COUNTRY_CODE == NER`. Should add similar (but simpler) code for the rest of the countries (i.e, `COUNTRY_CODE != NER`)\n", - "* Check why Data Element **Denominator** `routine_active_facilities` is **calculated at `YEAR` (aggregated) instead of `MONTH`** ... possibly fix this to match granularity of other alternatives for denominator (which are calculated at MONTH level)\n", - "* Modify **report notebook** and/or pipeline.py code so that it does not make the **pipeline FAIL** if `reporting_rate_dataset` or `reporting_rate_dataelement` is **not found** (which is now always the case since we only output 1 file at each run!!)" - ] - }, - { - "cell_type": "markdown", - "id": "0cdfdc73-bb9a-48a8-a26b-84ecbab2e0aa", - "metadata": {}, - "source": [ - "----------------" - ] - }, - { - "cell_type": "markdown", - "id": "339f6d58-0965-40ef-b718-96195d2463f8", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "markdown", - "id": "dd6cd6f8-b91b-4902-8801-a60e11776f98", - "metadata": {}, - "source": [ - "Set Default values **if _not_ provided by pipeline**
\n", - "This makes the execution flexible and \"safe\": nb can be run manually from here or be executed via pipeline, without having to change anything in the code!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93aac683-8828-4a42-b841-f16c7e8fbb07", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set BACKUP VALUE: root path - NEVER CHANGE THIS!\n", - "if (!exists(\"SNT_ROOT_PATH\")) {\n", - " SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "}\n", - "\n", - "\n", - "# Choose to run either DataSet OR DataElement method\n", - "if (!exists(\"REPORTING_RATE_METHOD\")) {\n", - " # REPORTING_RATE_METHOD <- \"DATASET\" \n", - " REPORTING_RATE_METHOD <- \"DATAELEMENT\"\n", - "}\n", - "\n", - "\n", - "# Data Elemenet method: Choice of which INDICATORS to use to count the nr of reporting facilities \n", - "# CONF\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_CONF\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_CONF <- TRUE # FALSE\n", - "}\n", - "\n", - "# SUSP\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_SUSP\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_SUSP <- TRUE # FALSE\n", - "}\n", - "\n", - "# TEST\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_TEST\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_TEST <- TRUE # FALSE\n", - "}\n", - "\n", - "\n", - "\n", - "# Data Elemenet RR. Choice: which df to use for nr of `EXPECTED_REPORTS` (DENOMINATOR) \n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {\n", - " # DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\" \n", - " DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\" \n", - " # DATAELEMENT_METHOD_DENOMINATOR <- \"DHIS2_EXPECTED_REPORTS\" # ⚠️ only if `REPORTING_RATE_METHOD == \"DATASET\"` && DataSet is available!! ⚠️\n", - "} \n" - ] - }, - { - "cell_type": "markdown", - "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", - "metadata": {}, - "source": [ - "## 1. 
Setup" - ] - }, - { - "cell_type": "markdown", - "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", - "metadata": {}, - "source": [ - "### 1.1. Paths" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# PROJECT PATHS\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') " - ] - }, - { - "cell_type": "markdown", - "id": "22971de0-1431-4cbd-b8c1-3bd3e1609e0d", - "metadata": {}, - "source": [ - "### 1.2. Utils functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1784fd43-03f3-478b-8148-4b478317ea21", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3bbcbd39-54e8-4ece-9244-30d7d30291d2", - "metadata": {}, - "source": [ - "### 1.3. Packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "426ecff6-0b4c-474d-a48d-826002205b89", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# List required pcks ----------------> check what are the really required libraries\n", - "required_packages <- c(\"arrow\", # for .parquet\n", - " \"tidyverse\",\n", - " \"stringi\", \n", - " \"jsonlite\", \n", - " \"httr\", \n", - " \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "markdown", - "id": "18a8e0c1-ac09-4435-b6f4-5f91fd916396", - "metadata": {}, - "source": [ - "### 1.3.1. 
OpenHEXA-specific settings" - ] - }, - { - "cell_type": "markdown", - "id": "ebb8c7d5-7c2c-4dbe-a1ba-238419fbedf3", - "metadata": {}, - "source": [ - "#### For 📦{sf}, tell OH where to find stuff ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91a66fb7-dd5e-43fd-a6a2-d8bb9f0315d6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "markdown", - "id": "ac9ee427-020e-47c5-b2c9-5ca24e1f2779", - "metadata": {}, - "source": [ - "#### Set environment to load openhexa.sdk from the right path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa331278-573d-4a22-ab16-da6972d7b0be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right path\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", - "metadata": {}, - "source": [ - "### 1.4. 
Load and check `config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "\n", - "config_file_name <- \"SNT_config.json\" \n", - "config_json <- tryCatch({\n", - " jsonlite::fromJSON(file.path(CONFIG_PATH, config_file_name)) \n", - " },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, config_file_name))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "29182f25-b0cf-46aa-9818-49616cd3f353", - "metadata": {}, - "source": [ - "**Save config fields as variables**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c52654c8-8a19-4e0c-a83b-1bc2eecae6bc", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Generic\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "\n", - "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"Ousmane\"\n", - "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "# Which reporting rate PRODUCT_UID to use (not that this is a dataset in COD, but 2 dataElements in BFA!)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "412572bc-fb96-4f61-ac49-be7f449219b6", - "metadata": 
{}, - "outputs": [], - "source": [ - "# DHIS2_INDICATORS\n", - "log_msg(paste(\"Expecting the following DHIS2 (aggregated) indicators : \", paste(DHIS2_INDICATORS, collapse=\", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a0a8562-4a70-455c-9ccf-aa39f4cf4e31", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fixed cols for routine data formatting \n", - "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') # (OU_NAME has homonimous values!)\n", - "# print(paste(\"Fixed routine data (`dhis2_routine`) columns (always expected): \", paste(fixed_cols, collapse=\", \")))\n", - "log_msg(paste(\"Expecting the following columns from routine data (`dhis2_routine`) : \", paste(fixed_cols, collapse=\", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86e82d54-2b00-4c25-9b34-3497d4c88c52", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fixed cols for exporting RR tables: to export output tables with consistent structure\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') " - ] - }, - { - "cell_type": "markdown", - "id": "dadc7351-e67e-450b-a046-bc64660a7dde", - "metadata": {}, - "source": [ - "### 1.5. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Alternatively, `CONF` could be made mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cf6e2a4-0822-4a0c-852e-143da5473d20", - "metadata": {}, - "outputs": [], - "source": [ - "nr_of_indicators_selected <- sum(DATAELEMENT_METHOD_NUMERATOR_CONF, DATAELEMENT_METHOD_NUMERATOR_SUSP, DATAELEMENT_METHOD_NUMERATOR_TEST)\n", - "\n", - "if (nr_of_indicators_selected == 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method 'Data Element'! Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", - "metadata": {}, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", - "metadata": {}, - "source": [ - "### 2.1. **Routine** data (DHIS2) \n", - "already formatted & aggregated (output of pipeline XXX)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "586e8da8-4e1c-431a-9b8d-1169167e1c09", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# DHIS2 Dataset extract identifier\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 routine data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "a2454183-44f7-4e2e-a0cf-ca112aa183bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Ensure correct data type for numerical columns \n", - "dhis2_routine <- dhis2_routine %>%\n", - " mutate(across(c(PERIOD, YEAR, MONTH), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edb2fcdc-ce0a-4c78-b06a-9f4610ab4714", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "821e1ebf-b2fa-4469-974e-2e4d27d58854", - "metadata": {}, - "source": [ - "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", - "Only when: `DATAELEMENT_METHOD_NUMERATOR == \"CONF|SUSP|TEST\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3f8b89e-a04e-4e0b-9892-95ce2150e7da", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "adec5412", - "metadata": {}, - "source": [ - "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", - "Based on which indicator(s) are selected (if any)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0bbcdf8c-873a-4b41-980a-f18d1863ab8f", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize empty vector\n", - "indicators_selected = c()\n", - "\n", - "# Add elements based on user selection(s)\n", - "if (DATAELEMENT_METHOD_NUMERATOR_CONF) {\n", - " indicators_selected = append(indicators_selected, \"CONF\")\n", - "}\n", - "\n", - "if (DATAELEMENT_METHOD_NUMERATOR_SUSP) {\n", - " indicators_selected = append(indicators_selected, \"SUSP\")\n", - "}\n", - "\n", - "if (DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", - " indicators_selected = append(indicators_selected, \"TEST\")\n", - "}\n", - "\n", - 
"print(paste0(\"Selected indicators: \", paste(indicators_selected, collapse = \", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b84753f8-aa9c-4563-beae-5e29b3f1e773", - "metadata": {}, - "outputs": [], - "source": [ - "# This is kinda useless now but KEEP in case we ADD MORE CHOICES OF INDICATORS!! \n", - "if(REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " if (DATAELEMENT_METHOD_NUMERATOR_CONF | DATAELEMENT_METHOD_NUMERATOR_SUSP | DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", - " log_msg(paste0(\"Indicator(s) \", paste(indicators_selected, collapse = \", \") , \" selected for calculation of numerator for method `Data Element`.\" ))\n", - " \n", - " if ( length(which(indicators_selected %in% names(dhis2_routine))) < length(indicators_selected) ) {\n", - " log_msg(paste0(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: \", paste(expected_col, collapse = \", \"), \".\"), \"warning\")\n", - " } \n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "c832da26-fe0c-43fe-8300-2fff5c4cbf34", - "metadata": {}, - "source": [ - "### 2.2. 
**Reporting** pre-computed from DHIS2 \n", - "Data granularity:\n", - "* **ADM2**\n", - "* **MONTH** (PERIOD)\n", - "\n", - "Note: data comes from different dataset (`DS_NAME`): `A SERVICES DE BASE`, `B SERVICES SECONDAIRES`,`D SERVICE HOPITAL` \n", - "\n", - "The col `DS_METRIC` indicates whether the `VALUE` is `EXPECTED_REPORTS` or `ACTUAL_REPORTS`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ce295b9-9898-4e12-8a91-92bb25b9e0a2", - "metadata": {}, - "outputs": [], - "source": [ - "# REPORTING_RATE_METHOD <- \"DATAELEMENT\" # \"DATASET\"\n", - "REPORTING_RATE_METHOD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a32fc96-5b8e-4108-a224-c0d843df9b47", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", - " \n", - " # Load file from dataset\n", - " dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pre-computed REPORTING data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 pre-computed REPORTING data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - " log_msg(msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e131d9ee-0e88-4bb6-982b-53b1229fba5f", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # Convert VALUE col to - should not be needed but keep as safety measure \n", - " dhis2_reporting <- dhis2_reporting |>\n", - " mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", - "\n", - " head(dhis2_reporting, 3)\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46e3dba8-d46b-457e-ba90-c663e30c42d2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# # Convert VALUE col to - should not be needed but keep as safety measure \n", - "# dhis2_reporting <- dhis2_reporting |>\n", - "# mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5149befe-b6ad-46a9-9879-7637ce5b02be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# head(dhis2_reporting, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "7a967af3-f6e5-428a-8769-72808f21a125", - "metadata": {}, - "source": [ - "#### 2.2.1. 
**Filter** to keep only values for `PRODUCT_UID` defined in config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1948c2f7-7a2c-47a2-9dc6-ba29da6d030c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "REPORTING_RATE_PRODUCT_ID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4258098-e24c-4520-914d-0f73354bb3ab", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " # Handle problems with incorrect configuration - to be improved 🚧\n", - " if (is.null(REPORTING_RATE_PRODUCT_ID)) {\n", - " log_msg(\"🛑 Problem with definition of REPORTING_RATE_PRODUCT_ID, check `SNT_config.json` file!\")\n", - " } else \n", - " product_name <- dhis2_reporting |> filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |> pull(PRODUCT_NAME) |> unique()\n", - " log_msg(glue::glue(\"Using REPORTING_RATE_PRODUCT_ID == `{REPORTING_RATE_PRODUCT_ID}`, corresponding to DHIS2 Product name : `{product_name}`.\"))\n", - "\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c22c6ada-7cb1-4fca-b65e-b51e5eca35a2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " dhis2_reporting_filtered <- dhis2_reporting |>\n", - " filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |>\n", - " select(-PRODUCT_UID, -PRODUCT_NAME) # useless cols now\n", - " \n", - " print(dim(dhis2_reporting_filtered))\n", - " head(dhis2_reporting_filtered)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9da035cf-5d3f-4df0-a063-2d2497616c82", - "metadata": {}, - "source": [ - "#### 2.2.2. 
Format to produce `dhis2_reporting_expected`\n", - "🚨 Note: Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for REPORTING_RATE calculations (methods dataset and dataelement)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e970f9d-258e-4050-ae69-185b88c79fc3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " dhis2_reporting_wide <- dhis2_reporting_filtered |> \n", - " pivot_wider(\n", - " names_from = PRODUCT_METRIC, \n", - " values_from = VALUE\n", - " )\n", - " \n", - " print(dim(dhis2_reporting_wide))\n", - " head(dhis2_reporting_wide)\n", - " \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eab31756-ae6b-4152-8ec3-8195236d8732", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denomitor for RR calculations (methods ANY and CONF)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " dhis2_reporting_expected <- dhis2_reporting_wide |> \n", - " select(-ACTUAL_REPORTS)\n", - " \n", - " print(dim(dhis2_reporting_expected))\n", - " head(dhis2_reporting_expected)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3c3d0f35-889d-4c16-9741-d6e75e2ef096", - "metadata": {}, - "source": [ - "#### 2.2.3. **Checks** on data completeness: _do **periods match** with routine data?_\n", - "Lack of perfect overlap in periods between routine data and reporting rate data might create headhaches downstream!
\n", - "Specifically, **incidence** calculations will show **N2 smaller than N1** due to **aggregation by YEAR when NA** values are present!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ea57600-418b-45bc-805a-f829e237b4c4", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " # --- Check Year Compatibility ---\n", - " routine_years <- sort(unique(as.integer(dhis2_routine$YEAR))) # as.integer\n", - " expected_years <- sort(unique(as.integer(dhis2_reporting_expected$YEAR))) # as.integer\n", - " \n", - " if (!setequal(routine_years, expected_years)) {\n", - " missing_in_routine <- setdiff(expected_years, routine_years)\n", - " missing_in_expected <- setdiff(routine_years, expected_years)\n", - " \n", - " if (length(missing_in_routine) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_reporting_expected' but not in 'dhis2_routine': \",\n", - " paste(missing_in_routine, collapse = \", \")))\n", - " }\n", - " if (length(missing_in_expected) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_routine' but not in 'dhis2_reporting_expected': \",\n", - " paste(missing_in_expected, collapse = \", \")))\n", - " }\n", - " } else {\n", - " log_msg(\"✅ YEAR values are consistent across 'dhis2_routine' and 'dhis2_reporting_expected'.\")\n", - " \n", - " # --- Check Month Compatibility (if years are consistent) ---\n", - " all_years <- unique(routine_years) # Or expected_years, they are the same now\n", - " \n", - " for (year_val in all_years) {\n", - " routine_months_for_year <- dhis2_routine %>%\n", - " filter(YEAR == year_val) %>%\n", - " pull(MONTH) %>%\n", - " unique() %>%\n", - " sort()\n", - " \n", - " expected_months_for_year <- dhis2_reporting_expected %>%\n", - " filter(YEAR == year_val) %>%\n", - " pull(MONTH) %>%\n", - " unique() %>%\n", - " sort()\n", - " \n", - " if 
(!setequal(routine_months_for_year, expected_months_for_year)) {\n", - " missing_in_routine_months <- setdiff(expected_months_for_year, routine_months_for_year)\n", - " missing_in_expected_months <- setdiff(routine_months_for_year, expected_months_for_year)\n", - " \n", - " if (length(missing_in_routine_months) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_routine_months, collapse = \", \"),\n", - " \"' present in 'dhis2_reporting_expected' but not in 'dhis2_routine'!\"\n", - " ))\n", - " }\n", - " if (length(missing_in_expected_months) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_expected_months, collapse = \", \"), \n", - " \"' present in 'dhis2_routine' but not in 'dhis2_reporting_expected'!\"\n", - " ))\n", - " }\n", - " } else {\n", - " log_msg(paste0(\"✅ For year \", year_val, \", months are consistent across both data frames.\"))\n", - " }\n", - " }\n", - " }\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e711191-995b-4f89-b10c-fc2214cdd8b2", - "metadata": {}, - "source": [ - "### 2.3. **Pyramid** to count OPEN facilities (denominator)\n", - "Table (and column) needed for denominator of \"Data Element\" reporting rate if choice == `PYRAMID_OPEN_FACILITIES`\n", - "\n", - "**Important**: the pyramid must contain the `OPENING_DATE` and `CLOSING_DATE` columns (this was implemented in the new extraction pipeline from 2025-09).
\n", - "Then, **depending on the Country** (well, their pyramid structure) **import** either:\n", - "* **Raw** pyramid for 🇳🇪 Niger: because first need to \"manually\" correctly aggregate the VALUEs for the HF (separate them from admin levels and sum up HD units)\n", - "* **Formatted** pyramid for all other countries encountered so far: 🇨🇩 DRC, 🇧🇫 Burkina Faso ... because their pyramid is already usable right away" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad3f83b6-2fdd-45da-8a4d-fb06513b6be2", - "metadata": {}, - "outputs": [], - "source": [ - "# DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\"\n", - "DATAELEMENT_METHOD_DENOMINATOR" - ] - }, - { - "cell_type": "markdown", - "id": "e7b80b6e-9e34-4e71-93e8-7e16a110e17c", - "metadata": {}, - "source": [ - "#### **Raw** pyramid for 🇳🇪 **Niger**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "652cf1a7-c9a2-48db-b44d-8fabfd0e072f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", - " \n", - " # Load file from dataset\n", - " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", - " dhis2_pyramid_raw <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid RAW data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 RAW pyramid data loaded from dataset : `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_raw), collapse=\", \"))\n", - " log_msg(msg)\n", - " \n", - " head(dhis2_pyramid_raw)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f1716ac3-ce8f-4223-9729-6ed826e743bc", - "metadata": {}, - "source": [ - "#### **Formatted** pyramid for all other countries (normal pyramid) 🇨🇩 🇧🇫" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc16ae54-4915-4333-b458-2b611e2b1792", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", - " \n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " \n", - " # Load file from dataset\n", - " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", - " dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset : `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - " log_msg(msg)\n", - " \n", - " head(dhis2_pyramid_formatted)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "eb4c5c63-d140-46b8-b686-886e612a31dc", - "metadata": {}, - "source": [ - "## 3. Calculate **Reporting Rate** (RR)\n", - "We compute it using 2 approaches, user can decided later on which one to use for incidence adjustment." - ] - }, - { - "cell_type": "markdown", - "id": "cb724aa8-5f06-4e99-aeca-640d0c1b049e", - "metadata": {}, - "source": [ - "## 3.1. 
\"**Dataset**\" reporting rate: pre-computed, from **DHIS2**\n", - "Exrtacted from DHIS2 and formatted. \n", - "\n", - "Straightforward: `ACTUAL_REPORTS` / `EXPECTED_REPORTS` (just pivot `DS_METRIC` and divide)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10b2f52b-0217-43f1-88a3-cd01d98869b1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " reporting_rate_dataset <- dhis2_reporting_wide |> \n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - " \n", - " print(dim(reporting_rate_dataset))\n", - " head(reporting_rate_dataset, 3)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3d49eda8-b4fd-437a-8938-17bf0806f281", - "metadata": {}, - "source": [ - "#### Quick data quality check 🔍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cff33416-ea66-4eeb-9d33-1597c2f05b0c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- Define function ---------------------------\n", - "inspect_reporting_rate <- function(data_tibble) {\n", - "\n", - " # Dynamically get the name of the tibble passed to the function\n", - " # Extract the litteral name of the variable passed (e.g., \"reporting_rate_dhis2_month\")\n", - " tibble_name_full <- deparse(substitute(data_tibble))\n", - "\n", - " # Extract the 'method' part from the tibble name\n", - " method <- stringr::str_extract(tibble_name_full, \"(?<=reporting_rate_).*\") # \"(?<=reporting_rate_).*?(?=_month)\"\n", - "\n", - " # Calculations for proportion of values > 1\n", - " values_greater_than_1 <- sum(data_tibble$REPORTING_RATE > 1, na.rm = TRUE)\n", - " total_values <- length(data_tibble$REPORTING_RATE)\n", - "\n", - " if (total_values > 0) {\n", - " proportion <- values_greater_than_1 / total_values * 100\n", - " min_rate <- 
min(data_tibble$REPORTING_RATE, na.rm = TRUE)\n", - " max_rate <- max(data_tibble$REPORTING_RATE, na.rm = TRUE)\n", - " } else {\n", - " proportion <- 0\n", - " min_rate <- NA # Set to NA if no values to calculate min/max\n", - " max_rate <- NA # Set to NA if no values to calculate min/max\n", - " }\n", - "\n", - " if (proportion == 0) {\n", - " clarification = NULL\n", - " } else {\n", - " clarification = \" (there are more reports than expected)\"\n", - " }\n", - "\n", - " # Print the formatted result\n", - " log_msg(\n", - " paste0(\n", - " \"🔍 For reporting rate method : `\", method, \"`, the values of REPORTING_RATE range from \", round(min_rate, 2),\n", - " \" to \", round(max_rate, 2),\n", - " \", and \", round(proportion, 2), \" % of values are >1\", clarification, \".\"\n", - " )\n", - " )\n", - "\n", - " # Histogram\n", - " hist(data_tibble$REPORTING_RATE, \n", - " breaks = 50)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2f4c11c-c683-4204-ab91-9d41cab4826c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " inspect_reporting_rate(reporting_rate_dataset)\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "04870e93-5385-425b-89fd-b815a87cfa21", - "metadata": {}, - "source": [ - "#### Subset cols" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d90671b0-36f8-4c6e-8736-4ea807079f83", - "metadata": { - "scrolled": true, - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " reporting_rate_dataset <- reporting_rate_dataset |> \n", - " select(all_of(fixed_cols_rr))\n", - " \n", - " dim(reporting_rate_dataset)\n", - " head(reporting_rate_dataset, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "62e6cb16-0196-447f-b142-aaec2120eecb", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "2dc27c07-80cd-465e-891f-9fb70111dbb0", - "metadata": {}, - "source": [ - "#### Plot by MONTH (heatmap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9ce56fc-f86a-4a2b-95b7-fb6ec5b89087", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " \n", - " # Plot reporting rate heatmap\n", - " options(repr.plot.width = 20, repr.plot.height = 10) \n", - " \n", - " # reporting_rate_conf_month %>%\n", - " reporting_rate_dataset %>%\n", - " mutate(\n", - " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", - " ) %>%\n", - " ggplot(., aes(x = DATE, \n", - " y = factor(ADM2_ID), \n", - " fill = REPORTING_RATE * 100)\n", - " ) + \n", - " geom_tile() +\n", - " scale_fill_viridis_c(\n", - " option = \"C\",\n", - " direction = 1, # blue = low, yellow = high\n", - " limits = c(0, 100),\n", - " name = \"Reporting rate (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Monthly Reporting Rate by Health District - Method 'DataSet'\",\n", - " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", - " x = \"Month\",\n", - " y = \"Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", - " legend.position = \"right\",\n", - " panel.grid = element_blank()\n", - " )\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00bf15b7-baa7-4734-8133-8d4a9cc843a3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "40b21c65-1b75-42f7-821a-24d31e436c73", - 
"metadata": {}, - "source": [ - "----------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "17ffece4-9420-4004-993b-b5692cc1d2de", - "metadata": {}, - "source": [ - "## 3.2. **Data Element** reporting rate: based on reporting of one or more indicators\n", - "**_Partially_ following methods by WHO and as per Diallo (2025) paper**\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single indicator (i.e., **confirmed** malaria case as `CONF`) or for _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "\n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "\n", - "For this method the user is allowed to **choose** how to calculate both the **numerator** and the **denominator**.
\n", - "Specifically:\n", - "* **Numerator**: is the number of **facilities that _actually reported_** data, and it is estimated based on whether a facility (FoSa, or HF, or `OU_ID`) **submitted data** for **_any_** of the following **indicators**:\n", - " * `CONF`: confirmed malaria cases and/or\n", - " * `SUSP`: suspected malaria cases and/or\n", - " * `TEST`: tested malaria cases
\n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "\n", - "
\n", - " \n", - "* **Denominator**: is the number of **facilities _expected_ to report**. This number can be obtained in two different ways:\n", - " * `\"DHIS2_EXPECTED_REPORTS\"`: uses the col `EXPECTED_REPORTS` from the df `dhis2_reporting_expected`.
\n", - " This is obtained directly from DHIS2, and is the same denominator used to calculate the \"Dataset\" reporting rate.\n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `routine_active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (`OU_ID`), defined as those that submitted _any_ data **at least once in a given year**, across ***all*** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - "\n", - "
\n", - "\n", - "This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions.
" - ] - }, - { - "cell_type": "markdown", - "id": "f5dcd3b9-6f02-4fc5-9e5f-2253c015a3d4", - "metadata": {}, - "source": [ - "### Calculate the **numerator**" - ] - }, - { - "cell_type": "markdown", - "id": "a90d9f4a-a058-4ad5-8ef2-f827987b5def", - "metadata": {}, - "source": [ - "**Note**: the col `REPORTED` keeps the same name regardless of the value of `DATAELEMENT_METHOD_NUMERATOR` because \n", - "in this way the code needs to be parametrized only once (here).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8076609c-46e8-478a-8283-bc63a70102f8", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "dhis2_routine_active <- dhis2_routine %>%\n", - " mutate(\n", - " # if_any() returns TRUE if the condition is met for any of the selected columns\n", - " ACTIVE = if_else(if_any(all_of(indicators_selected), ~ !is.na(.x)), 1, 0)\n", - " )\n", - "\n", - "log_msg(paste0(\"Evaluating reporting facilities based on indicators: \", paste(indicators_selected, collapse = \", \"), \".\"))\n", - "\n", - "dim(dhis2_routine_active)\n", - "head(dhis2_routine_active, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "325faf35-ed25-4b8e-b421-934a2852f27e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1773313-17e5-478d-b60d-c1193233204d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# --- 1. 
Calculate `SUBMITTED_REPORTS` as the nr of ACTIVE facilities (that REPORTED, each month) ------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "dhis2_routine_submitted <- dhis2_routine_active %>% # OLD: dhis2_routine_reporting_month <- dhis2_routine_reporting %>%\n", - " group_by(ADM2_ID, YEAR, MONTH) %>% \n", - " summarise(\n", - " SUBMITTED_REPORTS = sum(ACTIVE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " ungroup() %>% \n", - " mutate(YEAR = as.integer(YEAR),\n", - " MONTH = as.integer(MONTH)\n", - " ) \n", - "\n", - "print(dim(dhis2_routine_submitted))\n", - "head(dhis2_routine_submitted, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a25647e3-5674-44e0-855e-c3a48483310d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "15f4c12f", - "metadata": {}, - "source": [ - "### Calculate the **denominator**" - ] - }, - { - "cell_type": "markdown", - "id": "06b2070d-c672-425f-a78f-b94a8d16a017", - "metadata": {}, - "source": [ - "#### Option: `ROUTINE_ACTIVE_FACILITIES`\n", - "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`ROUTINE_ACTIVE_FACILITIES`** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08f03ed1-5831-4fe5-8bde-674a513e8110", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Calculate the tot nr of facilities (distinct OU_ID) based on all HF that appear in the routine data (each YEAR)\n", - "# meaning: regardless of what indicators they submit data for, as long as they have submitted something\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " routine_active_facilities <- dhis2_routine %>%\n", - " # Keep only rows where at least one indicator has non-NA value\n", - " filter(if_any(any_of(DHIS2_INDICATORS), ~ !is.na(.))) %>%\n", - " 
group_by(YEAR, ADM2_ID) %>%\n", - " summarize(\n", - " EXPECTED_REPORTS = n_distinct(OU_ID),\n", - " .groups = \"drop\" # remove grouping \n", - " )\n", - "\n", - " nr_of_rows <- nrow(routine_active_facilities)\n", - " log_msg(glue::glue(\"Produced df `routine_active_facilities`, with column `EXPECTED_REPORTS` calculated from DHIS2 routine data. Dataframe `routine_active_facilities` has {nr_of_rows} rows.\"))\n", - "\n", - " head(routine_active_facilities, 3)\n", - " \n", - "} \n" - ] - }, - { - "cell_type": "markdown", - "id": "6629dccb-97b0-4b0e-b23f-15b98704323d", - "metadata": {}, - "source": [ - "#### Option: `PYRAMID_OPEN_FACILITIES`\n", - "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`PYRAMID_OPEN_FACILITIES`** " - ] - }, - { - "cell_type": "markdown", - "id": "0972ffca-c14a-4b93-85ff-027d056c3759", - "metadata": {}, - "source": [ - "------------------" - ] - }, - { - "cell_type": "markdown", - "id": "d49219b7-5932-4062-a10d-e1f3a4a81449", - "metadata": {}, - "source": [ - "#### TEMPORARY! 🇳🇪 **Niger-specific method**\n", - "🚨 Specific to **Niger EnDoP**: Pre-processing needed to separate facilities from adm levels!! 🚨
\n", - "\n", - "⚠️⚠️⚠️ **TEMPORARY: This will be moved to a dedicated pipeline!** ⚠️⚠️⚠️
\n", - "\n", - "Specifically:\n", - "* **Hospital**s (HD a Hopital District): at **level 4** together with Aires de Sante\n", - "* All other **FoSa**s: at **level 6**, also mixed with the hospital units\n", - "\n", - "Therefore, to assign closed/open status, it is necessary to attach to each individual facility the closing and opening date columns. \n", - "To do this: \n", - "1) extract list of facilities and id across the 2 levels (4 and 6) and\n", - "2) calculate the nr of open facilities per MONTH (PERIOD) per ADM2, ending up with a df with cols: `ADM2_ID`, `YEAR`, `MONTH`, `OPEN_FACILITIES_COUNT` = `EXPECTED_REPORTS`\n", - "3) add this to the df with the **numerator** (`dhis2_routine_submitted`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6296329-9bcd-4d2c-afb3-520c6a159cdb", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "# names(dhis2_pyramid_raw)\n", - "dim(dhis2_pyramid_raw)\n", - "head(dhis2_pyramid_raw, 3)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "6f651b18-2d85-4e26-8952-45dae9020c40", - "metadata": {}, - "source": [ - "#### 1. 
Create df with list of all **facilities** with their `DATE_OPENED` and `DATE_CLOSED`: `facility_master`\n", - "Separate \"facilities\" (of any type, such as hospitals to CSI, Infermieres etc) from admin levels and hospital units (wards, depts...)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25dea3c7-44ea-42a7-b467-f470892fcfef", - "metadata": {}, - "outputs": [], - "source": [ - "# Helpers to detect Aires and Hospitals:\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "is_aire_l5 <- function(x) str_detect(x, regex(\"^\\\\s*aire[^a-zA-Z]?\", ignore_case = TRUE))\n", - "is_hospital_l4 <- function(x) str_detect(x, regex(\"^(hd|chr|chu|hgr)\", ignore_case = TRUE))\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55760b89-f4d0-40c5-9905-f7c7c4fee5c0", - "metadata": {}, - "outputs": [], - "source": [ - "# List of all FoSa (from Aires → Level 6)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "fosa_master <- dhis2_pyramid_raw %>%\n", - " filter(is_aire_l5(LEVEL_5_NAME)) %>%\n", - " distinct(\n", - " OU_ID = LEVEL_6_ID,\n", - " OU_NAME = LEVEL_6_NAME,\n", - " region = LEVEL_2_NAME,\n", - " district = LEVEL_3_NAME,\n", - " ADM2_ID = LEVEL_3_ID,\n", - " DATE_OPENED = OPENING_DATE, \n", - " DATE_CLOSED = CLOSED_DATE\n", - " ) %>%\n", - " mutate(OU_TYPE = \"FoSa\")\n", - "\n", - "dim(fosa_master)\n", - "head(fosa_master)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e9c9f59-9c6c-44e4-bbd9-13c3917f5117", - "metadata": {}, - "outputs": [], - "source": [ - "# List of all Hospitals (from Level 4, aggregate dates across children)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == 
\"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "hosp_master <- dhis2_pyramid_raw %>%\n", - "filter(is_hospital_l4(LEVEL_4_NAME)) %>%\n", - "group_by(LEVEL_4_ID, LEVEL_4_NAME, LEVEL_2_NAME, LEVEL_3_NAME, LEVEL_3_ID) %>%\n", - "summarise(\n", - " OPENING_DATE = suppressWarnings(min(OPENING_DATE, na.rm = TRUE)),\n", - " CLOSED_DATE = suppressWarnings(max(CLOSED_DATE, na.rm = TRUE)),\n", - " .groups = \"drop\"\n", - ") %>%\n", - "mutate(\n", - " DATE_OPENED = ifelse(is.infinite(OPENING_DATE), NA, OPENING_DATE) |> as_datetime(),\n", - " DATE_CLOSED = ifelse(is.infinite(CLOSED_DATE), NA, CLOSED_DATE) |> as_datetime()\n", - " ) %>%\n", - "distinct(\n", - " OU_ID = LEVEL_4_ID, \n", - " OU_NAME = LEVEL_4_NAME,\n", - " region=LEVEL_2_NAME,\n", - " district=LEVEL_3_NAME,\n", - " ADM2_ID=LEVEL_3_ID,\n", - " DATE_OPENED,\n", - " DATE_CLOSED\n", - ") %>%\n", - "mutate(\n", - " OU_TYPE = \"Hospital\"\n", - " )\n", - "\n", - "dim(hosp_master)\n", - "head(hosp_master)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5859e393-bfac-46e6-b103-cb8177100860", - "metadata": {}, - "outputs": [], - "source": [ - "# Merge both\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "facility_master <- bind_rows(fosa_master, hosp_master) %>% \n", - " select(ADM2_ID, \n", - " OU_ID, \n", - " DATE_OPENED, \n", - " DATE_CLOSED)\n", - "\n", - "dim(facility_master)\n", - "head(facility_master, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "191c3b01-1645-410e-be99-b247bf5f9cfb", - "metadata": {}, - "source": [ - "---------------------" - ] - }, - { - "cell_type": "markdown", - "id": "1ee48156-5ed5-43a5-b927-caa53c10d98e", - "metadata": {}, - "source": [ - "#### **Generic** part: applies to **all countries**" - ] - }, - { - "cell_type": "markdown", - "id": "3aa057a5-6f68-493e-83e2-81bafce42c9e", - 
"metadata": {}, - "source": [ - "#### 2. Calculate nr of **OPEN facilities** for each `MONTH` per `ADM2`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91f972f1-bcc9-458f-9662-5574efc7ac9d", - "metadata": {}, - "outputs": [], - "source": [ - "# Define start and end period based on routine data \n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "PERIOD_START <- dhis2_routine$PERIOD |> min()\n", - "PERIOD_END <- dhis2_routine$PERIOD |> max()\n", - "\n", - "print(paste0(\"Start period: \", PERIOD_START))\n", - "print(paste0(\"End period :\", PERIOD_END))\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6ed56f-b2a6-460c-a3dc-6c588c40b54c", - "metadata": {}, - "outputs": [], - "source": [ - "## Create a \"complete\" grid of every month and year for the period range ---------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "months_grid <- tibble(\n", - " month_date = seq(\n", - " ymd(paste0(PERIOD_START, \"01\")), # Converts 202201 to \"20220101\" and then to a date\n", - " ymd(paste0(PERIOD_END, \"01\")), # same\n", - " by = \"months\"\n", - " )\n", - ") %>%\n", - " mutate(\n", - " YEAR = year(month_date),\n", - " MONTH = month(month_date)\n", - " )\n", - "\n", - "dim(months_grid) \n", - "head(months_grid, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb7d9a47-053e-43a5-9e0d-c7b717236f3e", - "metadata": {}, - "outputs": [], - "source": [ - "## Create 
`facility_master` for any (🚨 non-NER) countries\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", - "\n", - " # Programmatically define `ADM2_ID`\n", - " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - " # Programmatically define `OU_ID`\n", - " HF_LEVEL <- glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\")\n", - "\n", - " facility_master <- dhis2_pyramid_formatted |>\n", - " mutate(\n", - " DATE_OPENED = with_tz(OPENING_DATE, \"UTC\"),\n", - " DATE_CLOSED = with_tz(CLOSED_DATE, \"UTC\")\n", - " ) |>\n", - " select(\n", - " ADM2_ID = all_of(ADMIN_2_LEVEL), \n", - " OU_ID = all_of(HF_LEVEL),\n", - " DATE_OPENED, #= OPENING_DATE,\n", - " DATE_CLOSED #= CLOSED_DATE\n", - ")\n", - "\n", - "head(facility_master)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f283cc96-ed69-43a3-964e-57ccb0180a4a", - "metadata": {}, - "outputs": [], - "source": [ - "## Create a \"complete\" grid of every ADM2_ID for every month ---------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "# This ensures that even if an ADM2_ID has zero open facilities in a month,\n", - "# it will still appear in the final result with a count of 0.\n", - "complete_grid <- expand_grid(\n", - " ADM2_ID = unique(facility_master$ADM2_ID),\n", - " month_date = months_grid$month_date\n", - ") %>%\n", - " mutate(\n", - " YEAR = year(month_date),\n", - " MONTH = month(month_date),\n", - " month_date = with_tz(as_datetime(month_date), \"UTC\") # GP added 0809\n", - " )\n", - "\n", - "head(complete_grid, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "6e905c46-036a-4aa9-85ad-216f846f9e1b", - "metadata": {}, - "outputs": [], - "source": [ - "## Calculate the number of open facilities ---------------------------------------\n", - "\n", - "# # The facility must have opened on or before the last day of the current month. \n", - "# # To calculate the last day: add one month and subtract one day from the first day.\n", - "# complete_grid$month_date[1] # \"2022-01-01\"\n", - "# complete_grid$month_date[1] + months(1) - days(1) # \"2022-01-31\"\n", - "# # The facility must either still be open (DATE_CLOSED is NA) OR it must have closed on or after the first day of that month.\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "open_facilities_count <- facility_master %>%\n", - " # Create a row for every possible combination of facility and month\n", - " crossing(months_grid) %>%\n", - " # A facility is \"open\" if it opened BEFORE the end of the month\n", - " # AND it either never closed (NA) or closed AFTER the start of the month.\n", - " filter(\n", - " DATE_OPENED <= month_date + months(1) - days(1) & # opened on or before the last day of the current month\n", - " (is.na(DATE_CLOSED) | DATE_CLOSED >= month_date) # \n", - " ) %>%\n", - " # Count the number of open facilities for each area and month\n", - " count(ADM2_ID, YEAR, MONTH, name = \"OPEN_FACILITIES_COUNT\")\n", - "\n", - "head(open_facilities_count, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ba854f2-5925-4154-b86e-3e4e7bb6c363", - "metadata": {}, - "outputs": [], - "source": [ - "## Join the counts back to the complete grid to include zeros --------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && 
REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "pyramid_open_facilities <- complete_grid %>%\n", - " left_join(open_facilities_count, by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>%\n", - " # If a month had no open facilities, the count will be NA. Change it to 0.\n", - " # Also rename `OPEN_FACILITIES_COUNT` to `EXPECTED_REPORTS` to use same col name as other methods\n", - " mutate(OPEN_FACILITIES_COUNT = replace_na(OPEN_FACILITIES_COUNT, 0)) %>% # DENOMINATOR: consistent col name across all methods \n", - " select(ADM2_ID, YEAR, MONTH, \n", - " EXPECTED_REPORTS = OPEN_FACILITIES_COUNT) %>%\n", - " arrange(ADM2_ID, YEAR, MONTH)\n", - "\n", - "print(dim(pyramid_open_facilities))\n", - "head(pyramid_open_facilities, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff1a8537-d093-4d5c-8a44-4b729090cced", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "811310a9-df85-4fa3-af9a-b931eaffd7e5", - "metadata": {}, - "source": [ - "### Calculate **Reporting Rate** " - ] - }, - { - "cell_type": "markdown", - "id": "8827cfd6-479b-4025-a379-d20bf20fcfb4", - "metadata": {}, - "source": [ - "**Join df for Denominator**\n", - "\n", - "**Note**
\n", - "in both df's (`dhis2_reporting_expected` OR `routine_active_facilities`) the col `EXPECTED_REPORTS` has the same name to simplify parametrization: only difference between the 2 options is the df to be joined (right element in `left_join()`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "670508a0-3075-4f82-aa2c-d26cf867f13d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- 2. Join `dhis2_reporting_expected` OR `dhis2_calculated_expected` to add `EXPECTED_REPORTS` ------------------------------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "# Parametrized based on DATAELEMENT_METHOD_DENOMINATOR: left_join() the respective df\n", - "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # Add df of rep rate extracted directly from DHIS2\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " dhis2_reporting_expected |> select(ADM2_ID, YEAR, MONTH, EXPECTED_REPORTS), # `dhis2_reporting_expected`\n", - " by = join_by(ADM2_ID, YEAR, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` extracted directly from DHIS2.\")\n", - " \n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " # Add df of rep rate CALCULATED based on submissiosn in dhis2 routine data \"active\" facilities\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " routine_active_facilities, # has only cols: `YEAR`, `ADM2_ID`, `EXPECTED_REPORTS`\n", - " by = join_by(ADM2_ID, YEAR) #, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 routine data. 
Here, ACTIVE facilities \n", - " are defined as facilities that reported on any of the extracted indicators at least once per year.\")\n", - " \n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " # Add df of rep rate CALCULATED based on OPEN facilities as per PYRAMID RAW\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " pyramid_open_facilities, \n", - " by = join_by(ADM2_ID, YEAR, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 pyramid. \n", - " This method counts the number of OPEN facilities for each ADM2 per MONTH.\")\n", - "}\n", - "\n", - "# Safety measures ...\n", - "dhis2_routine_submitted_expected <- dhis2_routine_submitted_expected |>\n", - " # ungroup() %>% \n", - " mutate(YEAR = as.integer(YEAR),\n", - " MONTH = as.integer(MONTH)\n", - " ) \n", - "\n", - "\n", - "print(dim(dhis2_routine_submitted_expected))\n", - "head(dhis2_routine_submitted_expected, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fad303c-b239-4cf9-93a8-fe3ce5c33c37", - "metadata": {}, - "outputs": [], - "source": [ - "# --- 3. 
Calculate `REPORTING_RATE` ------------------------------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " \n", - "reporting_rate_dataelement <- dhis2_routine_submitted_expected |>\n", - "mutate(\n", - " REPORTING_RATE = SUBMITTED_REPORTS / EXPECTED_REPORTS\n", - " ) \n", - "\n", - "dim(reporting_rate_dataelement)\n", - "head(reporting_rate_dataelement, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68023e8e-f7f6-4201-b097-1996bee57671", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# head(hf_active, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "ae3aa127-c20c-4ca5-af0c-4a4260883cac", - "metadata": {}, - "source": [ - "`#### 🚨 Here 👇 swap denominator: join `dhis2_reporting_expected` to replace `TOTAL_HF` with `EXPECTED_REPORTS``" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a97c7d75-3317-48bc-a2f1-770bf38d141a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " inspect_reporting_rate(reporting_rate_dataelement)\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92651472-26e2-4131-ac02-288122138b0b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# # --- 1. create intermediate df `hf_active_month`: summarize nr of \"active\" (reporting) HF by month ------------------------\n", - "# hf_active_month <- hf_active %>% \n", - "# # filter(ADM1_ID == \"rWrCdr321Qu\") |> # ⚠️⚠️⚠️ TEMP subset just for CODE development ... ! 
⚠️⚠️⚠️\n", - "# dplyr::group_by(ADM2_ID, YEAR, MONTH) %>%\n", - "# dplyr::summarize(\n", - "# SUBMITTED_REPORTS = length(which(ACTIVE == TRUE)), # 🚨 GP changed to BOOLEAN to save space\n", - "# .groups = \"drop\") |>\n", - "# mutate(YEAR = as.integer(YEAR), \n", - "# MONTH = as.integer(MONTH)\n", - "# )\n", - "\n", - "# print(dim(hf_active_month))\n", - "# head(hf_active_month)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db5ad094-0601-4a18-9435-db60c1f4e8ff", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - " reporting_rate_dataelement <- reporting_rate_dataelement |> \n", - " select(all_of(fixed_cols_rr))\n", - " \n", - " head(reporting_rate_dataelement, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05f94483-1524-426e-9fe3-4b9bf572c05e", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "f5827740-2917-4504-9017-9ec7d408e5f4", + "metadata": {}, + "source": [ + "Script structure:\n", + "\n", + " 0. Parameters: set back-up values for parameters, for when the notebook is run manually (_not_ via pipeline)\n", + " 1. Setup:\n", + " * Paths\n", + " * Utils functions\n", + " 2. Load Data\n", + " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", + " * **Reporting** (DHIS2) pre-computed, already formatted & aggregated (output of pipeline ???)\n", + " * **Shapes** (DHIS2) for plotting (this could be removed if we move the plots to \"report/EDA\" nb)\n", + " 3. Calculate **Reporting Rate (RR)**\n", + " * \"**Dataset**\": using pre-computed reportings from DHIS2/SNIS (was: \"DHIS2\")\n", + " * \"**Data Element**\": using calculated expected nr of report (nr of active facilities) (was: \"CONF\")\n", + " 4. 
**Export** reporting rate data to `.../data/dhis2/reporting_rate/` as .parquet (and .csv) files for **either**:\n", + " * data**set**: \"XXX_reporting_rate_**dataset**.parquet\" **or**\n", + " * data**element**: \"XXX_reporting_rate_**dataelement**.parquet\"" + ] + }, + { + "cell_type": "markdown", + "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", + "metadata": {}, + "source": [ + "--------------------" + ] + }, + { + "cell_type": "markdown", + "id": "e962c5a4-6b09-4485-8d71-d842159118d3", + "metadata": {}, + "source": [ + "### To Do:\n", + "* For `DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\"`: **add code** to count OPEN facilities () for **countries with \"normal\" pyramids** (i.e., when no mixing of facilities and admin levels ... !). Atm only code for Niger, which runs only if `COUNTRY_CODE == NER`. Should add similar (but simpler) code for the rest of the countries (i.e, `COUNTRY_CODE != NER`)\n", + "* Check why Data Element **Denominator** `routine_active_facilities` is **calculated at `YEAR` (aggregated) instead of `MONTH`** ... possibly fix this to match granularity of other alternatives for denominator (which are calculated at MONTH level)\n", + "* Modify **report notebook** and/or pipeline.py code so that it does not make the **pipeline FAIL** if `reporting_rate_dataset` or `reporting_rate_dataelement` is **not found** (which is now always the case since we only output 1 file at each run!!)" + ] + }, + { + "cell_type": "markdown", + "id": "0cdfdc73-bb9a-48a8-a26b-84ecbab2e0aa", + "metadata": {}, + "source": [ + "----------------" + ] + }, + { + "cell_type": "markdown", + "id": "339f6d58-0965-40ef-b718-96195d2463f8", + "metadata": {}, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "markdown", + "id": "dd6cd6f8-b91b-4902-8801-a60e11776f98", + "metadata": {}, + "source": [ + "Set Default values **if _not_ provided by pipeline**
\n", + "This makes the execution flexible and \"safe\": nb can be run manually from here or be executed via pipeline, without having to change anything in the code!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93aac683-8828-4a42-b841-f16c7e8fbb07", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set BACKUP VALUE: root path - NEVER CHANGE THIS!\n", + "if (!exists(\"SNT_ROOT_PATH\")) {\n", + " SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "}\n", + "\n", + "\n", + "# Choose to run either DataSet OR DataElement method\n", + "if (!exists(\"REPORTING_RATE_METHOD\")) {\n", + " # REPORTING_RATE_METHOD <- \"DATASET\" \n", + " REPORTING_RATE_METHOD <- \"DATAELEMENT\"\n", + "}\n", + "\n", + "\n", + "# Data Element method: Choice of which INDICATORS to use to count the nr of reporting facilities \n", + "# CONF\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_CONF\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_CONF <- TRUE # FALSE\n", + "}\n", + "\n", + "# SUSP\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_SUSP\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_SUSP <- TRUE # FALSE\n", + "}\n", + "\n", + "# TEST\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_TEST\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_TEST <- TRUE # FALSE\n", + "}\n", + "\n", + "\n", + "\n", + "# Data Element RR. Choice: which df to use for nr of `EXPECTED_REPORTS` (DENOMINATOR) \n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {\n", + " # DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\" \n", + " DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\" \n", + " # DATAELEMENT_METHOD_DENOMINATOR <- \"DHIS2_EXPECTED_REPORTS\" # ⚠️ only if `REPORTING_RATE_METHOD == \"DATASET\"` && DataSet is available!! ⚠️\n", + "} \n" + ] + }, + { + "cell_type": "markdown", + "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", + "metadata": {}, + "source": [ + "## 1. 
Setup" + ] + }, + { + "cell_type": "markdown", + "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", + "metadata": {}, + "source": [ + "### 1.1. Paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# PROJECT PATHS\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') " + ] + }, + { + "cell_type": "markdown", + "id": "22971de0-1431-4cbd-b8c1-3bd3e1609e0d", + "metadata": {}, + "source": [ + "### 1.2. Utils functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1784fd43-03f3-478b-8148-4b478317ea21", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate.r\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3bbcbd39-54e8-4ece-9244-30d7d30291d2", + "metadata": {}, + "source": [ + "### 1.3. Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "426ecff6-0b4c-474d-a48d-826002205b89", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List required pcks ----------------> check what are the really required libraries\n", + "required_packages <- c(\"arrow\", # for .parquet\n", + " \"tidyverse\",\n", + " \"stringi\", \n", + " \"jsonlite\", \n", + " \"httr\", \n", + " \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "markdown", + "id": "18a8e0c1-ac09-4435-b6f4-5f91fd916396", + "metadata": {}, + "source": [ + "### 1.3.1. 
OpenHEXA-specific settings" + ] + }, + { + "cell_type": "markdown", + "id": "ebb8c7d5-7c2c-4dbe-a1ba-238419fbedf3", + "metadata": {}, + "source": [ + "#### For 📦{sf}, tell OH where to find stuff ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a66fb7-dd5e-43fd-a6a2-d8bb9f0315d6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac9ee427-020e-47c5-b2c9-5ca24e1f2779", + "metadata": {}, + "source": [ + "#### Set environment to load openhexa.sdk from the right path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa331278-573d-4a22-ab16-da6972d7b0be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right path\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", + "metadata": {}, + "source": [ + "### 1.4. 
Load and check `config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "\n", + "config_file_name <- \"SNT_config.json\" \n", + "config_json <- tryCatch({\n", + " jsonlite::fromJSON(file.path(CONFIG_PATH, config_file_name)) \n", + " },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, config_file_name))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "29182f25-b0cf-46aa-9818-49616cd3f353", + "metadata": {}, + "source": [ + "**Save config fields as variables**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c52654c8-8a19-4e0c-a83b-1bc2eecae6bc", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Generic\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", + "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", + "\n", + "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"Ousmane\"\n", + "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "\n", + "# Which reporting rate PRODUCT_UID to use (note that this is a dataset in COD, but 2 dataElements in BFA!)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "412572bc-fb96-4f61-ac49-be7f449219b6", + "metadata": { 
+ "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2_INDICATORS\n", + "log_msg(paste(\"Expecting the following DHIS2 (aggregated) indicators : \", paste(DHIS2_INDICATORS, collapse=\", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a0a8562-4a70-455c-9ccf-aa39f4cf4e31", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fixed cols for routine data formatting \n", + "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') # (OU_NAME has homonymous values!)\n", + "# print(paste(\"Fixed routine data (`dhis2_routine`) columns (always expected): \", paste(fixed_cols, collapse=\", \")))\n", + "log_msg(paste(\"Expecting the following columns from routine data (`dhis2_routine`) : \", paste(fixed_cols, collapse=\", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86e82d54-2b00-4c25-9b34-3497d4c88c52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fixed cols for exporting RR tables: to export output tables with consistent structure\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') " + ] + }, + { + "cell_type": "markdown", + "id": "dadc7351-e67e-450b-a046-bc64660a7dde", + "metadata": {}, + "source": [ + "### 1.5. 🔍 Check: at least 1 indicator must be selected\n", + "The user can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", + "Alternatively, `CONF` could be made mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cf6e2a4-0822-4a0c-852e-143da5473d20", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "nr_of_indicators_selected <- sum(DATAELEMENT_METHOD_NUMERATOR_CONF, DATAELEMENT_METHOD_NUMERATOR_SUSP, DATAELEMENT_METHOD_NUMERATOR_TEST)\n", + "\n", + "if (nr_of_indicators_selected == 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method 'Data Element'! Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", + "metadata": {}, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", + "metadata": {}, + "source": [ + "### 2.1. 
**Routine** data (DHIS2) \n", + "already formatted & aggregated (output of pipeline XXX)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586e8da8-4e1c-431a-9b8d-1169167e1c09", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2 Dataset extract identifier\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "# Load file from dataset\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 routine data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2454183-44f7-4e2e-a0cf-ca112aa183bb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Ensure correct data type for numerical columns \n", + "dhis2_routine <- dhis2_routine %>%\n", + " mutate(across(c(PERIOD, YEAR, MONTH), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb2fcdc-ce0a-4c78-b06a-9f4610ab4714", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "821e1ebf-b2fa-4469-974e-2e4d27d58854", + "metadata": {}, + "source": [ + "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", + "Only when: `DATAELEMENT_METHOD_NUMERATOR == \"CONF|SUSP|TEST\"`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3f8b89e-a04e-4e0b-9892-95ce2150e7da", + "metadata": { + "vscode": { + 
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "adec5412", + "metadata": {}, + "source": [ + "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", + "Based on which indicator(s) are selected (if any)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bbcdf8c-873a-4b41-980a-f18d1863ab8f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Initialize empty vector\n", + "indicators_selected = c()\n", + "\n", + "# Add elements based on user selection(s)\n", + "if (DATAELEMENT_METHOD_NUMERATOR_CONF) {\n", + " indicators_selected = append(indicators_selected, \"CONF\")\n", + "}\n", + "\n", + "if (DATAELEMENT_METHOD_NUMERATOR_SUSP) {\n", + " indicators_selected = append(indicators_selected, \"SUSP\")\n", + "}\n", + "\n", + "if (DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", + " indicators_selected = append(indicators_selected, \"TEST\")\n", + "}\n", + "\n", + "print(paste0(\"Selected indicators: \", paste(indicators_selected, collapse = \", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b84753f8-aa9c-4563-beae-5e29b3f1e773", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# This is kinda useless now but KEEP in case we ADD MORE CHOICES OF INDICATORS!! 
\n", + "if(REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " if (DATAELEMENT_METHOD_NUMERATOR_CONF | DATAELEMENT_METHOD_NUMERATOR_SUSP | DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", + " log_msg(paste0(\"Indicator(s) \", paste(indicators_selected, collapse = \", \") , \" selected for calculation of numerator for method `Data Element`.\" ))\n", + " \n", + " if ( length(which(indicators_selected %in% names(dhis2_routine))) < length(indicators_selected) ) {\n", + " log_msg(paste0(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: \", paste(expected_col, collapse = \", \"), \".\"), \"warning\")\n", + " } \n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c832da26-fe0c-43fe-8300-2fff5c4cbf34", + "metadata": {}, + "source": [ + "### 2.2. **Reporting** pre-computed from DHIS2 \n", + "Data granularity:\n", + "* **ADM2**\n", + "* **MONTH** (PERIOD)\n", + "\n", + "Note: data comes from different dataset (`DS_NAME`): `A SERVICES DE BASE`, `B SERVICES SECONDAIRES`,`D SERVICE HOPITAL` \n", + "\n", + "The col `DS_METRIC` indicates whether the `VALUE` is `EXPECTED_REPORTS` or `ACTUAL_REPORTS`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce295b9-9898-4e12-8a91-92bb25b9e0a2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# REPORTING_RATE_METHOD <- \"DATAELEMENT\" # \"DATASET\"\n", + "REPORTING_RATE_METHOD" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a32fc96-5b8e-4108-a224-c0d843df9b47", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", + " \n", + " # Load file from dataset\n", + " 
dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pre-computed REPORTING data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 pre-computed REPORTING data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). Dataframe dimensions: \", \n", + " paste(dim(dhis2_reporting), collapse=\", \"))\n", + " log_msg(msg)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e131d9ee-0e88-4bb6-982b-53b1229fba5f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # Convert VALUE col to - should not be needed but keep as safety measure \n", + " dhis2_reporting <- dhis2_reporting |>\n", + " mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", + "\n", + " head(dhis2_reporting, 3)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46e3dba8-d46b-457e-ba90-c663e30c42d2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # Convert VALUE col to - should not be needed but keep as safety measure \n", + "# dhis2_reporting <- dhis2_reporting |>\n", + "# mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5149befe-b6ad-46a9-9879-7637ce5b02be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# head(dhis2_reporting, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "7a967af3-f6e5-428a-8769-72808f21a125", + "metadata": {}, + "source": [ + "#### 2.2.1. 
**Filter** to keep only values for `PRODUCT_UID` defined in config.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1948c2f7-7a2c-47a2-9dc6-ba29da6d030c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "REPORTING_RATE_PRODUCT_ID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4258098-e24c-4520-914d-0f73354bb3ab", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " # Handle problems with incorrect configuration - to be improved 🚧\n", + " if (is.null(REPORTING_RATE_PRODUCT_ID)) {\n", + " log_msg(\"🛑 Problem with definition of REPORTING_RATE_PRODUCT_ID, check `SNT_config.json` file!\")\n", + " } else \n", + " product_name <- dhis2_reporting |> filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |> pull(PRODUCT_NAME) |> unique()\n", + " log_msg(glue::glue(\"Using REPORTING_RATE_PRODUCT_ID == `{REPORTING_RATE_PRODUCT_ID}`, corresponding to DHIS2 Product name : `{product_name}`.\"))\n", + "\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c22c6ada-7cb1-4fca-b65e-b51e5eca35a2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " dhis2_reporting_filtered <- dhis2_reporting |>\n", + " filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |>\n", + " select(-PRODUCT_UID, -PRODUCT_NAME) # useless cols now\n", + " \n", + " print(dim(dhis2_reporting_filtered))\n", + " head(dhis2_reporting_filtered)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9da035cf-5d3f-4df0-a063-2d2497616c82", + "metadata": {}, + "source": [ + "#### 2.2.2. 
Format to produce `dhis2_reporting_expected`\n", + "🚨 Note: Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for REPORTING_RATE calculations (methods dataset and dataelement)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e970f9d-258e-4050-ae69-185b88c79fc3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " dhis2_reporting_wide <- dhis2_reporting_filtered |> \n", + " pivot_wider(\n", + " names_from = PRODUCT_METRIC, \n", + " values_from = VALUE\n", + " )\n", + " \n", + " print(dim(dhis2_reporting_wide))\n", + " head(dhis2_reporting_wide)\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eab31756-ae6b-4152-8ec3-8195236d8732", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denomitor for RR calculations (methods ANY and CONF)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " dhis2_reporting_expected <- dhis2_reporting_wide |> \n", + " select(-ACTUAL_REPORTS)\n", + " \n", + " print(dim(dhis2_reporting_expected))\n", + " head(dhis2_reporting_expected)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3c3d0f35-889d-4c16-9741-d6e75e2ef096", + "metadata": {}, + "source": [ + "#### 2.2.3. **Checks** on data completeness: _do **periods match** with routine data?_\n", + "Lack of perfect overlap in periods between routine data and reporting rate data might create headhaches downstream!
\n", + "Specifically, **incidence** calculations will show **N2 smaller than N1** due to **aggregation by YEAR when NA** values are present!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ea57600-418b-45bc-805a-f829e237b4c4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " # --- Check Year Compatibility ---\n", + " routine_years <- sort(unique(as.integer(dhis2_routine$YEAR))) # as.integer\n", + " expected_years <- sort(unique(as.integer(dhis2_reporting_expected$YEAR))) # as.integer\n", + " \n", + " if (!setequal(routine_years, expected_years)) {\n", + " missing_in_routine <- setdiff(expected_years, routine_years)\n", + " missing_in_expected <- setdiff(routine_years, expected_years)\n", + " \n", + " if (length(missing_in_routine) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_reporting_expected' but not in 'dhis2_routine': \",\n", + " paste(missing_in_routine, collapse = \", \")))\n", + " }\n", + " if (length(missing_in_expected) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_routine' but not in 'dhis2_reporting_expected': \",\n", + " paste(missing_in_expected, collapse = \", \")))\n", + " }\n", + " } else {\n", + " log_msg(\"✅ YEAR values are consistent across 'dhis2_routine' and 'dhis2_reporting_expected'.\")\n", + " \n", + " # --- Check Month Compatibility (if years are consistent) ---\n", + " all_years <- unique(routine_years) # Or expected_years, they are the same now\n", + " \n", + " for (year_val in all_years) {\n", + " routine_months_for_year <- dhis2_routine %>%\n", + " filter(YEAR == year_val) %>%\n", + " pull(MONTH) %>%\n", + " unique() %>%\n", + " sort()\n", + " \n", + " expected_months_for_year <- dhis2_reporting_expected %>%\n", + " filter(YEAR == year_val) %>%\n", + " pull(MONTH) %>%\n", + " unique() 
%>%\n", + " sort()\n", + " \n", + " if (!setequal(routine_months_for_year, expected_months_for_year)) {\n", + " missing_in_routine_months <- setdiff(expected_months_for_year, routine_months_for_year)\n", + " missing_in_expected_months <- setdiff(routine_months_for_year, expected_months_for_year)\n", + " \n", + " if (length(missing_in_routine_months) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_routine_months, collapse = \", \"),\n", + " \"' present in 'dhis2_reporting_expected' but not in 'dhis2_routine'!\"\n", + " ))\n", + " }\n", + " if (length(missing_in_expected_months) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_expected_months, collapse = \", \"), \n", + " \"' present in 'dhis2_routine' but not in 'dhis2_reporting_expected'!\"\n", + " ))\n", + " }\n", + " } else {\n", + " log_msg(paste0(\"✅ For year \", year_val, \", months are consistent across both data frames.\"))\n", + " }\n", + " }\n", + " }\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5e711191-995b-4f89-b10c-fc2214cdd8b2", + "metadata": {}, + "source": [ + "### 2.3. **Pyramid** to count OPEN facilities (denominator)\n", + "Table (and column) needed for denominator of \"Data Element\" reporting rate if choice == `PYRAMID_OPEN_FACILITIES`\n", + "\n", + "**Important**: the pyramid must contain the `OPENING_DATE` and `CLOSING_DATE` columns (this was implemented in the new extraction pipeline from 2025-09).
\n", + "Then, **depending on the Country** (well, their pyramid structure) **import** either:\n", + "* **Raw** pyramid for 🇳🇪 Niger: because first need to \"manually\" correctly aggregate the VALUEs for the HF (separate them from admin levels and sum up HD units)\n", + "* **Formatted** pyramid for all other countries encountered so far: 🇨🇩 DRC, 🇧🇫 Burkina Faso ... because their pyramid is already usable right away" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad3f83b6-2fdd-45da-8a4d-fb06513b6be2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\"\n", + "DATAELEMENT_METHOD_DENOMINATOR" + ] + }, + { + "cell_type": "markdown", + "id": "e7b80b6e-9e34-4e71-93e8-7e16a110e17c", + "metadata": {}, + "source": [ + "#### **Raw** pyramid for 🇳🇪 **Niger**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652cf1a7-c9a2-48db-b44d-8fabfd0e072f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", + " \n", + " # Load file from dataset\n", + " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", + " dhis2_pyramid_raw <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid RAW data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 RAW pyramid data loaded from dataset : `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_raw), collapse=\", \"))\n", + " log_msg(msg)\n", + " \n", + " head(dhis2_pyramid_raw)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f1716ac3-ce8f-4223-9729-6ed826e743bc", + "metadata": {}, + "source": [ + "#### **Formatted** pyramid for all other countries (normal pyramid) 🇨🇩 🇧🇫" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc16ae54-4915-4333-b458-2b611e2b1792", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", + " \n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " \n", + " # Load file from dataset\n", + " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", + " dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset : `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", + " log_msg(msg)\n", + " \n", + " head(dhis2_pyramid_formatted)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "eb4c5c63-d140-46b8-b686-886e612a31dc", + "metadata": {}, + "source": [ + "## 3. Calculate **Reporting Rate** (RR)\n", + "We compute it using 2 approaches, user can decided later on which one to use for incidence adjustment." + ] + }, + { + "cell_type": "markdown", + "id": "cb724aa8-5f06-4e99-aeca-640d0c1b049e", + "metadata": {}, + "source": [ + "## 3.1. 
\"**Dataset**\" reporting rate: pre-computed, from **DHIS2**\n", + "Exrtacted from DHIS2 and formatted. \n", + "\n", + "Straightforward: `ACTUAL_REPORTS` / `EXPECTED_REPORTS` (just pivot `DS_METRIC` and divide)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10b2f52b-0217-43f1-88a3-cd01d98869b1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " reporting_rate_dataset <- dhis2_reporting_wide |> \n", + " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", + " \n", + " print(dim(reporting_rate_dataset))\n", + " head(reporting_rate_dataset, 3)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3d49eda8-b4fd-437a-8938-17bf0806f281", + "metadata": {}, + "source": [ + "#### Quick data quality check 🔍" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cff33416-ea66-4eeb-9d33-1597c2f05b0c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# inspect_reporting_rate() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f4c11c-c683-4204-ab91-9d41cab4826c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " inspect_reporting_rate(reporting_rate_dataset)\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "04870e93-5385-425b-89fd-b815a87cfa21", + "metadata": {}, + "source": [ + "#### Subset cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d90671b0-36f8-4c6e-8736-4ea807079f83", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | 
DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " reporting_rate_dataset <- reporting_rate_dataset |> \n", + " select(all_of(fixed_cols_rr))\n", + " \n", + " dim(reporting_rate_dataset)\n", + " head(reporting_rate_dataset, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62e6cb16-0196-447f-b142-aaec2120eecb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2dc27c07-80cd-465e-891f-9fb70111dbb0", + "metadata": {}, + "source": [ + "#### Plot by MONTH (heatmap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9ce56fc-f86a-4a2b-95b7-fb6ec5b89087", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " \n", + " # Plot reporting rate heatmap\n", + " options(repr.plot.width = 20, repr.plot.height = 10) \n", + " \n", + " # reporting_rate_conf_month %>%\n", + " reporting_rate_dataset %>%\n", + " mutate(\n", + " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", + " ) %>%\n", + " ggplot(., aes(x = DATE, \n", + " y = factor(ADM2_ID), \n", + " fill = REPORTING_RATE * 100)\n", + " ) + \n", + " geom_tile() +\n", + " scale_fill_viridis_c(\n", + " option = \"C\",\n", + " direction = 1, # blue = low, yellow = high\n", + " limits = c(0, 100),\n", + " name = \"Reporting rate (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Monthly Reporting Rate by Health District - Method 'DataSet'\",\n", + " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", + " x = \"Month\",\n", + " y = \"Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", + " plot.subtitle = 
element_text(hjust = 0.5, size = 12),\n", + " legend.position = \"right\",\n", + " panel.grid = element_blank()\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00bf15b7-baa7-4734-8133-8d4a9cc843a3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "40b21c65-1b75-42f7-821a-24d31e436c73", + "metadata": {}, + "source": [ + "----------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "17ffece4-9420-4004-993b-b5692cc1d2de", + "metadata": {}, + "source": [ + "## 3.2. **Data Element** reporting rate: based on reporting of one or more indicators\n", + "**_Partially_ following methods by WHO and as per Diallo (2025) paper**\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single indicator (i.e., **confirmed** malaria case as `CONF`) or for _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "\n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "\n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and the **denominator**.
\n", + "Specifically:\n", + "* **Numerator**: is the number of **facilities that _actually reported_** data, and it is estimated based on whether a facility (FoSa, or HF, or `OU_ID`) **submitted data** for **_any_** of the following **indicators**:\n", + " * `CONF`: confirmed malaria cases and/or\n", + " * `SUSP`: suspected malaria cases and/or\n", + " * `TEST`: tested malaria cases
\n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "\n", + "
\n", + " \n", + "* **Denominator**: is the number of **facilities _expected_ to report**. This number can be obtained in two different ways:\n", + " * `\"DHIS2_EXPECTED_REPORTS\"`: uses the col `EXPECTED_REPORTS` from the df `dhis2_reporting_expected`.
\n", + " This is obtained directly from DHIS2, and is the same denominator used to calculate the \"Dataset\" reporting rate.\n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `routine_active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (`OU_ID`), defined as those that submitted _any_ data **at least once in a given year**, across ***all*** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + "\n", + "
\n", + "\n", + "This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions.
" + ] + }, + { + "cell_type": "markdown", + "id": "f5dcd3b9-6f02-4fc5-9e5f-2253c015a3d4", + "metadata": {}, + "source": [ + "### Calculate the **numerator**" + ] + }, + { + "cell_type": "markdown", + "id": "a90d9f4a-a058-4ad5-8ef2-f827987b5def", + "metadata": {}, + "source": [ + "**Note**: the col `REPORTED` keeps the same name regardless of the value of `DATAELEMENT_METHOD_NUMERATOR` because \n", + "in this way the code needs to be parametrized only once (here).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8076609c-46e8-478a-8283-bc63a70102f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "dhis2_routine_active <- dhis2_routine %>%\n", + " mutate(\n", + " # if_any() returns TRUE if the condition is met for any of the selected columns\n", + " ACTIVE = if_else(if_any(all_of(indicators_selected), ~ !is.na(.x)), 1, 0)\n", + " )\n", + "\n", + "log_msg(paste0(\"Evaluating reporting facilities based on indicators: \", paste(indicators_selected, collapse = \", \"), \".\"))\n", + "\n", + "dim(dhis2_routine_active)\n", + "head(dhis2_routine_active, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "325faf35-ed25-4b8e-b421-934a2852f27e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1773313-17e5-478d-b60d-c1193233204d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 1. 
Calculate `SUBMITTED_REPORTS` as the nr of ACTIVE facilities (that REPORTED, each month) ------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "dhis2_routine_submitted <- dhis2_routine_active %>% # OLD: dhis2_routine_reporting_month <- dhis2_routine_reporting %>%\n", + " group_by(ADM2_ID, YEAR, MONTH) %>% \n", + " summarise(\n", + " SUBMITTED_REPORTS = sum(ACTIVE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " ungroup() %>% \n", + " mutate(YEAR = as.integer(YEAR),\n", + " MONTH = as.integer(MONTH)\n", + " ) \n", + "\n", + "print(dim(dhis2_routine_submitted))\n", + "head(dhis2_routine_submitted, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25647e3-5674-44e0-855e-c3a48483310d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "15f4c12f", + "metadata": {}, + "source": [ + "### Calculate the **denominator**" + ] + }, + { + "cell_type": "markdown", + "id": "06b2070d-c672-425f-a78f-b94a8d16a017", + "metadata": {}, + "source": [ + "#### Option: `ROUTINE_ACTIVE_FACILITIES`\n", + "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`ROUTINE_ACTIVE_FACILITIES`** " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08f03ed1-5831-4fe5-8bde-674a513e8110", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate the tot nr of facilities (distinct OU_ID) based on all HF that appear in the routine data (each YEAR)\n", + "# meaning: regardless of what indicators they submit data for, as long as they have submitted something\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " routine_active_facilities <- dhis2_routine %>%\n", + " # Keep only rows where at least one indicator has non-NA value\n", + 
" filter(if_any(any_of(DHIS2_INDICATORS), ~ !is.na(.))) %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarize(\n", + " EXPECTED_REPORTS = n_distinct(OU_ID),\n", + " .groups = \"drop\" # remove grouping \n", + " )\n", + "\n", + " nr_of_rows <- nrow(routine_active_facilities)\n", + " log_msg(glue::glue(\"Produced df `routine_active_facilities`, with column `EXPECTED_REPORTS` calculated from DHIS2 routine data. Dataframe `routine_active_facilities` has {nr_of_rows} rows.\"))\n", + "\n", + " head(routine_active_facilities, 3)\n", + " \n", + "} \n" + ] + }, + { + "cell_type": "markdown", + "id": "6629dccb-97b0-4b0e-b23f-15b98704323d", + "metadata": {}, + "source": [ + "#### Option: `PYRAMID_OPEN_FACILITIES`\n", + "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`PYRAMID_OPEN_FACILITIES`** " + ] + }, + { + "cell_type": "markdown", + "id": "0972ffca-c14a-4b93-85ff-027d056c3759", + "metadata": {}, + "source": [ + "------------------" + ] + }, + { + "cell_type": "markdown", + "id": "d49219b7-5932-4062-a10d-e1f3a4a81449", + "metadata": {}, + "source": [ + "#### TEMPORARY! 🇳🇪 **Niger-specific method**\n", + "🚨 Specific to **Niger EnDoP**: Pre-processing needed to separate facilities from adm levels!! 🚨
\n", + "\n", + "⚠️⚠️⚠️ **TEMPORARY: This will be moved to a dedicated pipeline!** ⚠️⚠️⚠️
\n", + "\n", + "Specifically:\n", + "* **Hospital**s (HD a Hopital District): at **level 4** together with Aires de Sante\n", + "* All other **FoSa**s: at **level 6**, also mixed with the hospital units\n", + "\n", + "Therefore, to assign closed/open status, it is necessary to attach to each individual facility the closing and opening date columns. \n", + "To do this: \n", + "1) extract list of facilities and id across the 2 levels (4 and 6) and\n", + "2) calculate the nr of open facilities per MONTH (PERIOD) per ADM2, ending up with a df with cols: `ADM2_ID`, `YEAR`, `MONTH`, `OPEN_FACILITIES_COUNT` = `EXPECTED_REPORTS`\n", + "3) add this to the df with the **numerator** (`dhis2_routine_submitted`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6296329-9bcd-4d2c-afb3-520c6a159cdb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "# names(dhis2_pyramid_raw)\n", + "dim(dhis2_pyramid_raw)\n", + "head(dhis2_pyramid_raw, 3)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "6f651b18-2d85-4e26-8952-45dae9020c40", + "metadata": {}, + "source": [ + "#### 1. 
Create df with list of all **facilities** with their `DATE_OPENED` and `DATE_CLOSED`: `facility_master`\n", + "Separate \"facilities\" (of any type, such as hospitals to CSI, Infermieres etc) from admin levels and hospital units (wards, depts...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25dea3c7-44ea-42a7-b467-f470892fcfef", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# is_aire_l5() and is_hospital_l4() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55760b89-f4d0-40c5-9905-f7c7c4fee5c0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List of all FoSa (from Aires → Level 6)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "fosa_master <- dhis2_pyramid_raw %>%\n", + " filter(is_aire_l5(LEVEL_5_NAME)) %>%\n", + " distinct(\n", + " OU_ID = LEVEL_6_ID,\n", + " OU_NAME = LEVEL_6_NAME,\n", + " region = LEVEL_2_NAME,\n", + " district = LEVEL_3_NAME,\n", + " ADM2_ID = LEVEL_3_ID,\n", + " DATE_OPENED = OPENING_DATE, \n", + " DATE_CLOSED = CLOSED_DATE\n", + " ) %>%\n", + " mutate(OU_TYPE = \"FoSa\")\n", + "\n", + "dim(fosa_master)\n", + "head(fosa_master)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e9c9f59-9c6c-44e4-bbd9-13c3917f5117", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List of all Hospitals (from Level 4, aggregate dates across children)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "hosp_master <- dhis2_pyramid_raw %>%\n", + "filter(is_hospital_l4(LEVEL_4_NAME)) %>%\n", + "group_by(LEVEL_4_ID, LEVEL_4_NAME, LEVEL_2_NAME, LEVEL_3_NAME, 
LEVEL_3_ID) %>%\n", + "summarise(\n", + " OPENING_DATE = suppressWarnings(min(OPENING_DATE, na.rm = TRUE)),\n", + " CLOSED_DATE = suppressWarnings(max(CLOSED_DATE, na.rm = TRUE)),\n", + " .groups = \"drop\"\n", + ") %>%\n", + "mutate(\n", + " DATE_OPENED = ifelse(is.infinite(OPENING_DATE), NA, OPENING_DATE) |> as_datetime(),\n", + " DATE_CLOSED = ifelse(is.infinite(CLOSED_DATE), NA, CLOSED_DATE) |> as_datetime()\n", + " ) %>%\n", + "distinct(\n", + " OU_ID = LEVEL_4_ID, \n", + " OU_NAME = LEVEL_4_NAME,\n", + " region=LEVEL_2_NAME,\n", + " district=LEVEL_3_NAME,\n", + " ADM2_ID=LEVEL_3_ID,\n", + " DATE_OPENED,\n", + " DATE_CLOSED\n", + ") %>%\n", + "mutate(\n", + " OU_TYPE = \"Hospital\"\n", + " )\n", + "\n", + "dim(hosp_master)\n", + "head(hosp_master)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5859e393-bfac-46e6-b103-cb8177100860", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Merge both\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "facility_master <- bind_rows(fosa_master, hosp_master) %>% \n", + " select(ADM2_ID, \n", + " OU_ID, \n", + " DATE_OPENED, \n", + " DATE_CLOSED)\n", + "\n", + "dim(facility_master)\n", + "head(facility_master, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "191c3b01-1645-410e-be99-b247bf5f9cfb", + "metadata": {}, + "source": [ + "---------------------" + ] + }, + { + "cell_type": "markdown", + "id": "1ee48156-5ed5-43a5-b927-caa53c10d98e", + "metadata": {}, + "source": [ + "#### **Generic** part: applies to **all countries**" + ] + }, + { + "cell_type": "markdown", + "id": "3aa057a5-6f68-493e-83e2-81bafce42c9e", + "metadata": {}, + "source": [ + "#### 2. 
Calculate nr of **OPEN facilities** for each `MONTH` per `ADM2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91f972f1-bcc9-458f-9662-5574efc7ac9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define start and end period based on routine data \n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "PERIOD_START <- dhis2_routine$PERIOD |> min()\n", + "PERIOD_END <- dhis2_routine$PERIOD |> max()\n", + "\n", + "print(paste0(\"Start period: \", PERIOD_START))\n", + "print(paste0(\"End period :\", PERIOD_END))\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c6ed56f-b2a6-460c-a3dc-6c588c40b54c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create a \"complete\" grid of every month and year for the period range ---------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "months_grid <- tibble(\n", + " month_date = seq(\n", + " ymd(paste0(PERIOD_START, \"01\")), # Converts 202201 to \"20220101\" and then to a date\n", + " ymd(paste0(PERIOD_END, \"01\")), # same\n", + " by = \"months\"\n", + " )\n", + ") %>%\n", + " mutate(\n", + " YEAR = year(month_date),\n", + " MONTH = month(month_date)\n", + " )\n", + "\n", + "dim(months_grid) \n", + "head(months_grid, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb7d9a47-053e-43a5-9e0d-c7b717236f3e", + "metadata": { + 
"vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create `facility_master` for any (🚨 non-NER) countries\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", + "\n", + " # Programmatically define `ADM2_ID`\n", + " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + " # Programmatically define `OU_ID`\n", + " HF_LEVEL <- glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\")\n", + "\n", + " facility_master <- dhis2_pyramid_formatted |>\n", + " mutate(\n", + " DATE_OPENED = with_tz(OPENING_DATE, \"UTC\"),\n", + " DATE_CLOSED = with_tz(CLOSED_DATE, \"UTC\")\n", + " ) |>\n", + " select(\n", + " ADM2_ID = all_of(ADMIN_2_LEVEL), \n", + " OU_ID = all_of(HF_LEVEL),\n", + " DATE_OPENED, #= OPENING_DATE,\n", + " DATE_CLOSED #= CLOSED_DATE\n", + ")\n", + "\n", + "head(facility_master)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f283cc96-ed69-43a3-964e-57ccb0180a4a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create a \"complete\" grid of every ADM2_ID for every month ---------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "# This ensures that even if an ADM2_ID has zero open facilities in a month,\n", + "# it will still appear in the final result with a count of 0.\n", + "complete_grid <- expand_grid(\n", + " ADM2_ID = unique(facility_master$ADM2_ID),\n", + " month_date = months_grid$month_date\n", + ") %>%\n", + " mutate(\n", + " YEAR = year(month_date),\n", + " MONTH = month(month_date),\n", + " month_date = with_tz(as_datetime(month_date), 
\"UTC\") # GP added 0809\n", + " )\n", + "\n", + "head(complete_grid, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e905c46-036a-4aa9-85ad-216f846f9e1b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Calculate the number of open facilities ---------------------------------------\n", + "\n", + "# # The facility must have opened on or before the last day of the current month. \n", + "# # To calculate the last day: add one month and subtract one day from the first day.\n", + "# complete_grid$month_date[1] # \"2022-01-01\"\n", + "# complete_grid$month_date[1] + months(1) - days(1) # \"2022-01-31\"\n", + "# # The facility must either still be open (DATE_CLOSED is NA) OR it must have closed on or after the first day of that month.\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "open_facilities_count <- facility_master %>%\n", + " # Create a row for every possible combination of facility and month\n", + " crossing(months_grid) %>%\n", + " # A facility is \"open\" if it opened BEFORE the end of the month\n", + " # AND it either never closed (NA) or closed AFTER the start of the month.\n", + " filter(\n", + " DATE_OPENED <= month_date + months(1) - days(1) & # opened on or before the last day of the current month\n", + " (is.na(DATE_CLOSED) | DATE_CLOSED >= month_date) # \n", + " ) %>%\n", + " # Count the number of open facilities for each area and month\n", + " count(ADM2_ID, YEAR, MONTH, name = \"OPEN_FACILITIES_COUNT\")\n", + "\n", + "head(open_facilities_count, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ba854f2-5925-4154-b86e-3e4e7bb6c363", + "metadata": { + "vscode": { + "languageId": "r" + 
} + }, + "outputs": [], + "source": [ + "## Join the counts back to the complete grid to include zeros --------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "pyramid_open_facilities <- complete_grid %>%\n", + " left_join(open_facilities_count, by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>%\n", + " # If a month had no open facilities, the count will be NA. Change it to 0.\n", + " # Also rename `OPEN_FACILITIES_COUNT` to `EXPECTED_REPORTS` to use same col name as other methods\n", + " mutate(OPEN_FACILITIES_COUNT = replace_na(OPEN_FACILITIES_COUNT, 0)) %>% # DENOMINATOR: consistent col name across all methods \n", + " select(ADM2_ID, YEAR, MONTH, \n", + " EXPECTED_REPORTS = OPEN_FACILITIES_COUNT) %>%\n", + " arrange(ADM2_ID, YEAR, MONTH)\n", + "\n", + "print(dim(pyramid_open_facilities))\n", + "head(pyramid_open_facilities, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff1a8537-d093-4d5c-8a44-4b729090cced", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "811310a9-df85-4fa3-af9a-b931eaffd7e5", + "metadata": {}, + "source": [ + "### Calculate **Reporting Rate** " + ] + }, + { + "cell_type": "markdown", + "id": "8827cfd6-479b-4025-a379-d20bf20fcfb4", + "metadata": {}, + "source": [ + "**Join df for Denominator**\n", + "\n", + "**Note**
\n", + "in both df's (`dhis2_reporting_expected` OR `routine_active_facilities`) the col `EXPECTED_REPORTS` has the same name to simplify parametrization: only difference between the 2 options is the df to be joined (right element in `left_join()`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "670508a0-3075-4f82-aa2c-d26cf867f13d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 2. Join `dhis2_reporting_expected` OR `dhis2_calculated_expected` to add `EXPECTED_REPORTS` ------------------------------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "# Parametrized based on DATAELEMENT_METHOD_DENOMINATOR: left_join() the respective df\n", + "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # Add df of rep rate extracted directly from DHIS2\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " dhis2_reporting_expected |> select(ADM2_ID, YEAR, MONTH, EXPECTED_REPORTS), # `dhis2_reporting_expected`\n", + " by = join_by(ADM2_ID, YEAR, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` extracted directly from DHIS2.\")\n", + " \n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " # Add df of rep rate CALCULATED based on submissiosn in dhis2 routine data \"active\" facilities\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " routine_active_facilities, # has only cols: `YEAR`, `ADM2_ID`, `EXPECTED_REPORTS`\n", + " by = join_by(ADM2_ID, YEAR) #, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 routine data. 
Here, ACTIVE facilities \n", + " are defined as facilities that reported on any of the extracted indicators at least once per year.\")\n", + " \n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " # Add df of rep rate CALCULATED based on OPEN facilities as per PYRAMID RAW\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " pyramid_open_facilities, \n", + " by = join_by(ADM2_ID, YEAR, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 pyramid. \n", + " This method counts the number of OPEN facilities for each ADM2 per MONTH.\")\n", + "}\n", + "\n", + "# Safety measures ...\n", + "dhis2_routine_submitted_expected <- dhis2_routine_submitted_expected |>\n", + " # ungroup() %>% \n", + " mutate(YEAR = as.integer(YEAR),\n", + " MONTH = as.integer(MONTH)\n", + " ) \n", + "\n", + "\n", + "print(dim(dhis2_routine_submitted_expected))\n", + "head(dhis2_routine_submitted_expected, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fad303c-b239-4cf9-93a8-fe3ce5c33c37", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 3. 
Calculate `REPORTING_RATE` ------------------------------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " \n", + "reporting_rate_dataelement <- dhis2_routine_submitted_expected |>\n", + "mutate(\n", + " REPORTING_RATE = SUBMITTED_REPORTS / EXPECTED_REPORTS\n", + " ) \n", + "\n", + "dim(reporting_rate_dataelement)\n", + "head(reporting_rate_dataelement, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68023e8e-f7f6-4201-b097-1996bee57671", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# head(hf_active, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "ae3aa127-c20c-4ca5-af0c-4a4260883cac", + "metadata": {}, + "source": [ + "`#### 🚨 Here 👇 swap denominator: join `dhis2_reporting_expected` to replace `TOTAL_HF` with `EXPECTED_REPORTS``" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a97c7d75-3317-48bc-a2f1-770bf38d141a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " inspect_reporting_rate(reporting_rate_dataelement)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92651472-26e2-4131-ac02-288122138b0b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # --- 1. create intermediate df `hf_active_month`: summarize nr of \"active\" (reporting) HF by month ------------------------\n", + "# hf_active_month <- hf_active %>% \n", + "# # filter(ADM1_ID == \"rWrCdr321Qu\") |> # ⚠️⚠️⚠️ TEMP subset just for CODE development ... ! 
⚠️⚠️⚠️\n", + "# dplyr::group_by(ADM2_ID, YEAR, MONTH) %>%\n", + "# dplyr::summarize(\n", + "# SUBMITTED_REPORTS = length(which(ACTIVE == TRUE)), # 🚨 GP changed to BOOLEAN to save space\n", + "# .groups = \"drop\") |>\n", + "# mutate(YEAR = as.integer(YEAR), \n", + "# MONTH = as.integer(MONTH)\n", + "# )\n", + "\n", + "# print(dim(hf_active_month))\n", + "# head(hf_active_month)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db5ad094-0601-4a18-9435-db60c1f4e8ff", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + " reporting_rate_dataelement <- reporting_rate_dataelement |> \n", + " select(all_of(fixed_cols_rr))\n", + " \n", + " head(reporting_rate_dataelement, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05f94483-1524-426e-9fe3-4b9bf572c05e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "73ed8e24-1aab-47af-9d91-5bc4899a40e9", + "metadata": {}, + "source": [ + "`#### Quick data quality check 🔍`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713a5ed3-2aeb-4949-8ecc-6ee3f787a719", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "# Plot reporting rate heatmap\n", + "options(repr.plot.width = 20, repr.plot.height = 10) \n", + "\n", + "# reporting_rate_conf_month %>%\n", + "reporting_rate_dataelement %>%\n", + "mutate(\n", + " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", + " ) %>%\n", + "ggplot(., aes(x = DATE, \n", + " y = factor(ADM2_ID), \n", + " fill = REPORTING_RATE * 100)\n", + " ) + \n", + " geom_tile() +\n", + " scale_fill_viridis_c(\n", + " option = \"C\",\n", + " direction = 1, # blue = low, yellow = high\n", + " limits = c(0, 100),\n", + " name = \"Reporting rate 
(%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Monthly Reporting Rate by Health District - Method 'DataElement'\",\n", + " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", + " x = \"Month\",\n", + " y = \"Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", + " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", + " legend.position = \"right\",\n", + " panel.grid = element_blank()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93f5b7f0-bf5e-4567-9d16-da2091125988", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6729e183-5395-4fb7-a535-978c61124710", + "metadata": {}, + "source": [ + "# 4. Export 📁 /data/ folder" + ] + }, + { + "cell_type": "markdown", + "id": "ef68ae41-a0a9-4b45-8b7d-3d1c9b535ad9", + "metadata": {}, + "source": [ + "### 🧹 Clear output directory\n", + "This is needed to ensure that only 2 files are written to the new version of the Dataset:\n", + "* **Data Set** reporting rate (only one way to calculate it, not parametrized as nothing to \"decide\" here)\n", + "* **Data Element** reporting rate: here there are 7 possible combinations of numerator times 3 possible combinations of denominator.
\n", + " These are too many options to give to the incidence pipeline (the step that ingests this data), where these would need to be hardcoded in the pipeline module. When running the incidence pipeline, the user simply chooses whether to use `\"dataset\"` or `\"dataelement\"`, and therefore there must be only one file for each option.
\n", + " However, we want to **preserve the info** on the choice of **numerator** and **denominator** in the **filename**. The import function used in incidence therefore only looks for the fixed pattern in the filename, and ignores the tags for numerator and denominator (e.g., \"n-conf-susp-test\", \"d-dexrep\")." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eb882f3-a443-4363-bcad-be5b4ebc7d8f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Cleanup\n", + "path_to_clear <- file.path(DATA_PATH, \"reporting_rate\")\n", + "files_to_delete <- list.files(path_to_clear, full.names = TRUE, recursive = TRUE)\n", + "unlink(files_to_delete, recursive = TRUE)\n", + "log_msg(glue::glue(\"🧹 Deleting all existing files from `{path_to_clear}`. Output of current pipeline run will replace output of previous run.\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1372184e-a1a9-472a-87d4-69e38a1b139d", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### CSV" + ] + }, + { + "cell_type": "markdown", + "id": "c266c99e-a08e-471b-93dd-dbedb4841483", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Build up file name for **data Element** method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b6532c9-292e-4e8c-8e9d-987867920a6d", + "metadata": { + "papermill": { + "duration": 0.198788, + "end_time": "2025-08-26T09:50:02.770154", + "exception": false, + "start_time": "2025-08-26T09:50:02.571366", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 🚨 
Currently not in use! Keeping for future update to method 🚨 (GP 2025-08-29)\n", + "\n", + "# Abbreviation for Data Elememnt chosen NUMERATOR\n", + "method_num = tolower(paste0(\"n-\", paste(indicators_selected, collapse = \"-\")))\n", + "method_num\n", + "\n", + "\n", + "# Abbreviation for Data Elememnt chosen DENOMINATOR\n", + "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " method_den = \"d-dexrep\" # \"d1\"\n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " method_den = \"d-actfac\" # \"d2\"\n", + " } else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " method_den = \"d-opnfcl\" # \"d2\"\n", + " }\n", + "\n", + "method_den" + ] + }, + { + "cell_type": "markdown", + "id": "cf5bcd47-dba1-4a7a-81cf-d036fd0ee4db", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Write function to assemble path based on method - for .**csv**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b01c022-aa52-4a1b-a2fe-7bcb145a2049", + "metadata": { + "papermill": { + "duration": 0.108587, + "end_time": "2025-08-26T09:50:02.884462", + "exception": false, + "start_time": "2025-08-26T09:50:02.775875", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# snt_write_csv() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "markdown", + "id": "512ba94e-b7fc-4e45-bdda-8f5533e4e665", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Use function to export .csv files" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "8689ebc3-d975-45be-92fd-1fedfc733f49", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Dataset\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " snt_write_csv(x = reporting_rate_dataset, \n", + " output_data_path = DATA_PATH, \n", + " method = \"dataset\",\n", + " country_code = COUNTRY_CODE) \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b16e74ae-64f2-4267-a6ae-8413c8463af6", + "metadata": { + "papermill": { + "duration": 2.659797, + "end_time": "2025-08-26T09:50:05.545618", + "exception": false, + "start_time": "2025-08-26T09:50:02.885821", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Data Element\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " snt_write_csv(x = reporting_rate_dataelement,\n", + " output_data_path = DATA_PATH, \n", + " method = \"dataelement\",\n", + " country_code = COUNTRY_CODE)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "cfd1679e-dc0e-4805-9420-0788884a7713", + "metadata": { + "papermill": { + "duration": 0.000345, + "end_time": "2025-08-26T09:50:05.546427", + "exception": false, + "start_time": "2025-08-26T09:50:05.546082", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### parquet" + ] + }, + { + "cell_type": "markdown", + "id": "bed7679d-c392-4e3a-9fc7-4d6ae6982517", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Write function to assemble path based on method - for .**parquet**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26e68499-ef55-46f6-a017-8d03fdbff1b4", + "metadata": { + "papermill": { + "duration": 0.100077, + "end_time": 
"2025-08-26T09:50:05.647079", + "exception": false, + "start_time": "2025-08-26T09:50:05.547002", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# snt_write_parquet() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "markdown", + "id": "8250b998-2669-4590-a4fe-770e42b2d43f", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Use function to export .csv files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52b5720c-864e-49ac-bf40-6b5551214eaa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Dataset\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " snt_write_parquet(x = reporting_rate_dataset,\n", + " output_data_path = DATA_PATH,\n", + " method = \"dataset\",\n", + " country_code = COUNTRY_CODE\n", + " ) \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95bf17f5-6015-464c-9388-df2397d1609c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Data Element\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " snt_write_parquet(x = reporting_rate_dataelement,\n", + " output_data_path = DATA_PATH,\n", + " method = \"dataelement\",\n", + " country_code = COUNTRY_CODE\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b3e4b8-1a62-47c8-877b-1dae4511e4f0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "73ed8e24-1aab-47af-9d91-5bc4899a40e9", - "metadata": {}, - "source": [ - "`#### Quick data quality check 🔍`" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "713a5ed3-2aeb-4949-8ecc-6ee3f787a719", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "# Plot reporting rate heatmap\n", - "options(repr.plot.width = 20, repr.plot.height = 10) \n", - "\n", - "# reporting_rate_conf_month %>%\n", - "reporting_rate_dataelement %>%\n", - "mutate(\n", - " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", - " ) %>%\n", - "ggplot(., aes(x = DATE, \n", - " y = factor(ADM2_ID), \n", - " fill = REPORTING_RATE * 100)\n", - " ) + \n", - " geom_tile() +\n", - " scale_fill_viridis_c(\n", - " option = \"C\",\n", - " direction = 1, # blue = low, yellow = high\n", - " limits = c(0, 100),\n", - " name = \"Reporting rate (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Monthly Reporting Rate by Health District - Method 'DataElement'\",\n", - " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", - " x = \"Month\",\n", - " y = \"Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", - " legend.position = \"right\",\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93f5b7f0-bf5e-4567-9d16-da2091125988", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": 
"6729e183-5395-4fb7-a535-978c61124710", - "metadata": {}, - "source": [ - "# 4. Export 📁 /data/ folder" - ] - }, - { - "cell_type": "markdown", - "id": "ef68ae41-a0a9-4b45-8b7d-3d1c9b535ad9", - "metadata": {}, - "source": [ - "### 🧹 Clear output directory\n", - "This is needed to ensure that only 2 files are written to the new version of the Dataset:\n", - "* **Data Set** reporting rate (only one way to calculate it, not parametrized as nothing to \"decide\" here)\n", - "* **Data Element** reporting rate: here there are 7 possible combinations of numerator times 3 possible combinatiosn of denominator.
\n", - " These are too many optiosn to give to the incidence pipeline (the step that ingests this data), where these would need to be hardcoded in the pipeline module. When running the incidence pipeline, the user simply choses whether to use `\"dataset\"` or `\"dataelement\"`, and therefore there must be only one file for each option.
\n", - " However, we want to **preserve the info** on the choice of **numerator** and **denominator** in the **filename**. The import function used in incidence therefore only looks for the fixed pattern in the filename, and ignores the tags for numerator and denominator (e.g., \"n-conf-susp-test\", \"d-dexrep\")." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7eb882f3-a443-4363-bcad-be5b4ebc7d8f", - "metadata": {}, - "outputs": [], - "source": [ - "# Cleanup\n", - "path_to_clear <- file.path(DATA_PATH, \"reporting_rate\")\n", - "files_to_delete <- list.files(path_to_clear, full.names = TRUE, recursive = TRUE)\n", - "unlink(files_to_delete, recursive = TRUE)\n", - "log_msg(glue::glue(\"🧹 Deleting all existing files from `{path_to_clear}`. Output of current pipeline run will replace output of previous run.\"))" - ] - }, - { - "cell_type": "markdown", - "id": "1372184e-a1a9-472a-87d4-69e38a1b139d", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### CSV" - ] - }, - { - "cell_type": "markdown", - "id": "c266c99e-a08e-471b-93dd-dbedb4841483", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Build up file name for **data Element** method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b6532c9-292e-4e8c-8e9d-987867920a6d", - "metadata": { - "papermill": { - "duration": 0.198788, - "end_time": "2025-08-26T09:50:02.770154", - "exception": false, - "start_time": "2025-08-26T09:50:02.571366", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# 🚨 Currently not in use! 
Keeping for future update to method 🚨 (GP 2025-08-29)\n", - "\n", - "# Abbreviation for Data Elememnt chosen NUMERATOR\n", - "method_num = tolower(paste0(\"n-\", paste(indicators_selected, collapse = \"-\")))\n", - "method_num\n", - "\n", - "\n", - "# Abbreviation for Data Elememnt chosen DENOMINATOR\n", - "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " method_den = \"d-dexrep\" # \"d1\"\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " method_den = \"d-actfac\" # \"d2\"\n", - " } else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " method_den = \"d-opnfcl\" # \"d2\"\n", - " }\n", - "\n", - "method_den" - ] - }, - { - "cell_type": "markdown", - "id": "cf5bcd47-dba1-4a7a-81cf-d036fd0ee4db", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Write function to assemble path based on method - for .**csv**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b01c022-aa52-4a1b-a2fe-7bcb145a2049", - "metadata": { - "papermill": { - "duration": 0.108587, - "end_time": "2025-08-26T09:50:02.884462", - "exception": false, - "start_time": "2025-08-26T09:50:02.775875", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# write function\n", - "snt_write_csv <- function(x, output_data_path, method) {\n", - " \n", - " full_directory_path <- file.path(output_data_path, \"reporting_rate\")\n", - " \n", - " if (!dir.exists(full_directory_path)) {\n", - " dir.create(full_directory_path, recursive = TRUE)\n", - " }\n", - "\n", - " file_path <- file.path(full_directory_path, paste0(COUNTRY_CODE, \"_reporting_rate_\", method, \".csv\")) \n", - " \n", - " write_csv(x, file_path)\n", - "\n", - " log_msg(paste0(\"Exported : \", file_path))\n", - "}" - ] - }, 
- { - "cell_type": "markdown", - "id": "512ba94e-b7fc-4e45-bdda-8f5533e4e665", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Use function to export .csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8689ebc3-d975-45be-92fd-1fedfc733f49", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Dataset\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " snt_write_csv(x = reporting_rate_dataset, \n", - " output_data_path = DATA_PATH, \n", - " method = \"dataset\") \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b16e74ae-64f2-4267-a6ae-8413c8463af6", - "metadata": { - "papermill": { - "duration": 2.659797, - "end_time": "2025-08-26T09:50:05.545618", - "exception": false, - "start_time": "2025-08-26T09:50:02.885821", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Method \"Data Element\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " snt_write_csv(x = reporting_rate_dataelement,\n", - " output_data_path = DATA_PATH, \n", - " method = \"dataelement\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "cfd1679e-dc0e-4805-9420-0788884a7713", - "metadata": { - "papermill": { - "duration": 0.000345, - "end_time": "2025-08-26T09:50:05.546427", - "exception": false, - "start_time": "2025-08-26T09:50:05.546082", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### parquet" - ] - }, - { - "cell_type": "markdown", - "id": "bed7679d-c392-4e3a-9fc7-4d6ae6982517", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Write function to assemble 
path based on method - for .**parquet**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26e68499-ef55-46f6-a017-8d03fdbff1b4", - "metadata": { - "papermill": { - "duration": 0.100077, - "end_time": "2025-08-26T09:50:05.647079", - "exception": false, - "start_time": "2025-08-26T09:50:05.547002", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# write function\n", - "snt_write_parquet <- function(x, output_data_path, method) {\n", - " \n", - " full_directory_path <- file.path(output_data_path, \"reporting_rate\")\n", - " \n", - " if (!dir.exists(full_directory_path)) {\n", - " dir.create(full_directory_path, recursive = TRUE)\n", - " }\n", - "\n", - " file_path <- file.path(full_directory_path, paste0(COUNTRY_CODE, \"_reporting_rate_\", method, \".parquet\")) \n", - " \n", - " arrow::write_parquet(x, file_path)\n", - "\n", - " log_msg(paste0(\"Exported : \", file_path))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "8250b998-2669-4590-a4fe-770e42b2d43f", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Use function to export .csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52b5720c-864e-49ac-bf40-6b5551214eaa", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Dataset\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " snt_write_parquet(x = reporting_rate_dataset,\n", - " output_data_path = DATA_PATH,\n", - " method = \"dataset\"\n", - " ) \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95bf17f5-6015-464c-9388-df2397d1609c", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Data Element\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " snt_write_parquet(x = 
reporting_rate_dataelement,\n", - " output_data_path = DATA_PATH,\n", - " method = \"dataelement\"\n", - " )\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65b3e4b8-1a62-47c8-877b-1dae4511e4f0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb b/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb index 27d2cd2..65073fc 100644 --- a/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb @@ -1,998 +1,1113 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "ad4e51fd-ab5a-478c-856a-bc3308ce5781", - "metadata": {}, - "source": [ - "-------------\n", - "🤌🏼 Points to discuss:\n", - "* **what we do want to plot here?**
\n", - " Plot only what is produced by the pipeline (hence reflect choice of parameters from pipeline run) OR all the possible options (all output produced by all pipelines run so far, meaning whatever is writte the to most recent version of the Dataset?)\n", - "* **how to handle missing files?**: namely, situations in which files are not yet been produced. In this reporting rate case, if the user only runs the pipeline to produce the \"Dataset\" reporrting rate file,, then we cannot plot anything for the \"Data Element\" reporting rate as there is no file yet ...\n", - " Atm this is handled with `if` logic, but should be made more elegant to avoid repeating the same code twice (for dataset and for dataelement)\n", - "\n", - "-------------\n", - "\n", - "🚧 To do:\n", - "* **Plots shouls be wrapped as functions (DRY code)**! Cuold save in .R file in this same location to `source()` only here (as plots are specifc to this notebook, no need to save in snt_utils.R)\n", - "* **Display _real_ data**: do **_not_ cap** reporting rate values at 1 (100%)!! It's important to visualize real full range if we want to qualitatively assess and compare different methods!\n", - "* **fix object names**: `routine_data` is NOT routine data ... !!\n", - "* When importing `reporting_rate_data`, try if possible to avoid using `tryCatch`, and use `log_msg(..., \"warning\")` instead (should simplify code and logic ... ). Idea is to **log a meaningful warning without making the pipeline fail** just becauase a file in the report nb is missing ... !\n", - "\n", - "-------------" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "ad4e51fd-ab5a-478c-856a-bc3308ce5781", + "metadata": {}, + "source": [ + "-------------\n", + "🤌🏼 Points to discuss:\n", + "* **what we do want to plot here?**
\n", + " Plot only what is produced by the pipeline (hence reflect choice of parameters from pipeline run) OR all the possible options (all output produced by all pipelines run so far, meaning whatever is written to the most recent version of the Dataset?)\n", + "* **how to handle missing files?**: namely, situations in which files have not yet been produced. In this reporting rate case, if the user only runs the pipeline to produce the \"Dataset\" reporting rate file, then we cannot plot anything for the \"Data Element\" reporting rate as there is no file yet ...\n", + " Atm this is handled with `if` logic, but should be made more elegant to avoid repeating the same code twice (for dataset and for dataelement)\n", + "\n", + "-------------\n", + "\n", + "🚧 To do:\n", + "* **Plots should be wrapped as functions (DRY code)**! Could save in .R file in this same location to `source()` only here (as plots are specific to this notebook, no need to save in snt_utils.R)\n", + "* **Display _real_ data**: do **_not_ cap** reporting rate values at 1 (100%)!! It's important to visualize real full range if we want to qualitatively assess and compare different methods!\n", + "* **fix object names**: `routine_data` is NOT routine data ... !!\n", + "* When importing `reporting_rate_data`, try if possible to avoid using `tryCatch`, and use `log_msg(..., \"warning\")` instead (should simplify code and logic ... ). Idea is to **log a meaningful warning without making the pipeline fail** just because a file in the report nb is missing ... 
!\n", + "\n", + "-------------" + ] + }, + { + "cell_type": "markdown", + "id": "80fa8c3c-ed62-4248-8149-ffe2974a7206", + "metadata": {}, + "source": [ + "# Taux de Rapportage des Formations Sanitaires - Health Facility Reporting Rates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35bc4c99-5e5c-44dc-8c67-7f38eaec708e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate.r\"))\n", + "\n", + "# List required packages \n", + "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"leaflet\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "# Required environment for the sf packages\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"dd297a84-5a55-4374-9d2b-3148fde8072d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa63eb27-746f-420b-87ad-da82139acff9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# printdim() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a603fdb-e3ae-4aa3-a908-0385ae216d49", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import DHIS2 shapes data\n", + "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9656abb7-0085-4feb-974c-fb0b1c68c38f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import pyramid data\n", + "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "# Select distinct (already done in SNT format pipeline)\n", + "ADMIN_1_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), \"NAME\", \"ID\")\n", + "ADMIN_2_ID <- 
str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), \"NAME\", \"ID\")\n", + "\n", + "pyramid_data <- pyramid_data %>%\n", + " distinct(across(all_of(c(ADMIN_1_ID, ADMIN_2_ID))), .keep_all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e912503-5c57-4997-8c68-da673bd14626", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "print(dim(pyramid_data))\n", + "head(pyramid_data)" + ] + }, + { + "cell_type": "markdown", + "id": "78ec55d0-3a0d-413d-97cd-303895275f88", + "metadata": {}, + "source": [ + "## A) Taux de Soumission des Rapports / Dataset Reporting Rate\n", + "\n", + "**[FR]**\n", + "Cette section analyse le **taux de soumission des rapports**, tel que calculé dans le Système National d’Information Sanitaire (SNIS). Ce taux est défini comme le nombre de rapports effectivement reçus (rapports actuels) divisé par le nombre de rapports attendus (rapports attendus) sur une période donnée. Les rapports attendus correspondent au nombre de formations sanitaires qui, selon les paramètres du SNIS, devaient soumettre un rapport. Cet indicateur permet d’évaluer si les structures ont transmis les rapports requis, sans tenir compte du contenu ou de l’exhaustivité des données saisies.\n", + "\n", + "**[EN]**\n", + "This section analyzes the **dataset reporting rate**, as calculated in the Health Management Information System (HMIS). The rate is defined as the number of reports actually submitted (actual reports) divided by the number of reports expected (expected reports) over a given period. Expected reports refer to the number of health facilities that were required to report according to SNIS configuration. This indicator helps assess whether health facilities submitted their required reports, regardless of the content or completeness of the data within those reports." 
+ ] + }, + { + "cell_type": "markdown", + "id": "793a685b-a5cc-4e12-9c78-e548beffa213", + "metadata": {}, + "source": [ + "**Question:** Can the reporting rate file be loaded only once (using a parameter see below)? -> if that's the case, we can remove specific plotting codes for the \"dataset\" and \"dataelement\" files and just keep one \"plotting code\" for both types \n", + "\n", + "**Suggestion (link to previous Question):** The file name can be parameterized by injecting the user selection via parameters={...} from the OpenHexa pipeline. \n", + "\n", + "> paste0(COUNTRY_CODE, \"_reporting_rate_\", REPORTING_RATE_METHOD ,\".parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d9d428-c5a7-4f35-8a21-8f22adaa6a26", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import from Dataset\n", + "reporting_rate_data <- tryCatch({\n", + " # Attempt to load the dataset\n", + " get_latest_dataset_file_in_memory(\n", + " DATASET_NAME, \n", + " paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", + " )\n", + " }, \n", + " error = function(e) {\n", + " # If an error occurs, log a warning\n", + " # msg <- paste(\"[WARNING] Warning: Could not load reporting rate file for:\", COUNTRY_CODE, \". Proceeding with empty data. Error:\", conditionMessage(e))\n", + " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataset.parquet` does not exist, skipped loading. \n", + " To generate this file, re-run the reporting rate pipeline. 
Error:\", conditionMessage(e))\n", + " log_msg(msg, level = \"warning\")\n", + " \n", + " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", + " return(\n", + " tibble(\n", + " YEAR = double(),\n", + " MONTH = double(),\n", + " ADM2_ID = character(),\n", + " REPORTING_RATE = double()\n", + " )\n", + " )\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d13dea1-e204-4c27-8a44-14e260bcdad1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add _NAME cols from pyramid\n", + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + " ADMIN_2_ID <- str_replace(ADM_2, \"NAME\", \"ID\") \n", + " reporting_rate_data <- reporting_rate_data %>% \n", + " left_join(pyramid_data[c(ADM_2, ADMIN_2_ID)], by = c(\"ADM2_ID\" = ADMIN_2_ID))\n", + " \n", + " colnames(reporting_rate_data)[colnames(reporting_rate_data) == ADM_2] <- \"ADM2_NAME\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96b0b12-e168-421f-b0a9-76e83c48842c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(reporting_rate_data, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "64d787c8-25d3-4b2c-87e9-6d35b206b018", + "metadata": {}, + "source": [ + "**fix:** \n", + " - Just replaced this line with the variable \"ADM2_NAME\" : \n", + "> Plot heatmap \n", + "> options(repr.plot.width = 18, repr.plot.height = 15) \n", + "> ggplot(reporting_rate_data, aes(x = date, y = **ADM2_NAME**, fill = category)) + " + ] + }, + { + "cell_type": "markdown", + "id": "c13f15a4-2788-4d57-9edc-78d0afdbe278", + "metadata": {}, + "source": [ + "### Plot: Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb3b565e-8497-4d88-8d6d-ae6b3e2929b2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + " # Prepare 
date column + category\n", + " reporting_rate_data <- reporting_rate_data %>%\n", + " mutate(\n", + " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", + " ADM2_ID = factor(ADM2_ID),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100, # `pmin()` caps to 100%\n", + " reporting_pct = REPORTING_RATE * 100,\n", + " category = cut(\n", + " reporting_pct,\n", + " # breaks = c(-Inf, 50, 80, 90, Inf),\n", + " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", + " # GP 2025-08-07 added this, but double check (seems too many >100!!)\n", + " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", + " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", + " right = TRUE # FALSE: intervals are left-closed: lower bound is included\n", + " )\n", + " )\n", + " \n", + " # Define color scale\n", + " reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50–80\" = \"#fdae61\", # orange\n", + " \"80–90\" = \"#ffffbf\", # yellow\n", + " \"90-100\" = \"#1a9641\", # green\n", + " \">100\" = \"darkgreen\"\n", + " )\n", + " \n", + " # Plot heatmap\n", + " options(repr.plot.width = 18, repr.plot.height = 15)\n", + " ggplot(reporting_rate_data, aes(x = date, y = ADM2_NAME, fill = category)) + # -> Using a ADM2_NAME Variable to select the column !!\n", + " geom_tile() +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Taux de soumission des rapports mensuels par district sanitaire\",\n", + " subtitle = \"Monthly Dataset Reporting Rate by Health District\",\n", + " x = \"Mois - Month\",\n", + " y = \"District Sanitaire - Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", + " plot.subtitle = element_text(hjust = 0.5, size 
= 16),\n", + " # legend.position = \"right\",\n", + " legend.position = \"top\",\n", + " panel.grid = element_blank()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28662f31-8aa9-4f83-8dd6-8eb489723652", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Prepare the data\n", + "reporting_rate_data_box <- reporting_rate_data %>%\n", + " mutate(\n", + " MONTH = as.integer(MONTH),\n", + " YEAR = as.factor(YEAR),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100\n", + " reporting_pct = REPORTING_RATE * 100\n", + " )\n", + "\n", + "# Month labels in French\n", + "month_labels_fr <- c(\n", + " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", + " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", + ")\n", + "\n", + "# Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", + " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", + " scale_x_discrete(labels = month_labels_fr) +\n", + " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", + " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Distribution mensuelle du taux de soumission des rapports\",\n", + " subtitle = \"Monthly Distribution of Dataset Reporting Rate by Health District (2021–2024)\",\n", + " x = \"Mois\",\n", + " fill = \"Année\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " plot.subtitle = element_text(size = 16),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fd09749-2042-49d4-b2a3-9c2e6e5eae52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": 
[], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Aggregate to annual reporting rate per district\n", + "annual_data <- reporting_rate_data %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", + " ungroup()\n", + "\n", + "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", + "map_data <- shapes_data %>%\n", + " left_join(annual_data, by = \"ADM2_ID\")\n", + "\n", + "# Step 3: Bin the reporting rate into categories\n", + "map_data <- map_data %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-79\", # \"50-80\"\n", + " reporting_rate < 0.9 ~ \"80-89\", # \"80-90\"\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " ),\n", + " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-79\", \"80-89\", \">=90\")) # levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + " )\n", + "\n", + "# Step 4: Define colors\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\",\n", + " \"50-79\" = \"#fdae61\",\n", + " \"80-89\" = \"#ffffbf\",\n", + " \">=90\" = \"#1a9641\"\n", + ")\n", + "\n", + "# Step 5: Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 10)\n", + "ggplot(map_data) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " facet_wrap(~ YEAR) +\n", + " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", + " labs(\n", + " title = \"Taux de soumission des rapports annuels par district sanitaire\",\n", + " subtitle = \"Annual Dataset Reporting Completeness by Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " strip.text = element_text(face = \"bold\", size = 16),\n", + " plot.title = element_text(face = \"bold\")\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "df3f7949-baff-4cd9-a022-594420765289", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", + "mean_reporting_stats <- map_data %>%\n", + " group_by(ADM2_ID) %>%\n", + " summarise(\n", + " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " )\n", + " )\n", + "\n", + "# Set correct factor levels to match legend\n", + "mean_reporting_stats$reporting_cat <- factor(\n", + " mean_reporting_stats$reporting_cat,\n", + " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + ")\n", + "\n", + "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", + "mean_reporting_map <- shapes_data %>%\n", + " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", + " st_as_sf()\n", + "\n", + "# Step 3: Define custom color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50-80\" = \"#fdae61\", # orange\n", + " \"80-90\" = \"#ffffbf\", # yellow\n", + " \">=90\" = \"#1a9641\" # green\n", + ")\n", + "\n", + "# Step 4: Plot\n", + "options(repr.plot.width = 20, repr.plot.height = 10)\n", + "ggplot(mean_reporting_map) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\",\n", + " drop = FALSE\n", + " ) +\n", + " labs(\n", + " title = \"Taux moyen de soumission des rapports (toutes années confondues)\",\n", + " subtitle = \"Mean Annual Dataset Reporting Rate (All Years Combined)\"\n", + " ) +\n", + 
" theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9497287c-bbd2-446f-946d-88e34233f9f0", + "metadata": {}, + "source": [ + "## B) Taux de rapportage des éléments de données: cas confirmés / Data element Reporting Rate: confirmed cases\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8cd01e-e4b5-41f5-bff8-d35a1b143d0e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # import data\n", + "# # was: routine_data\n", + "# reporting_rate_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")) }, \n", + "# error = function(e) {\n", + "# msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", + "# # cat(msg)\n", + "# log_msg(msg, level = \"warning\") # GP 20250908\n", + "# # stop(msg) # GP 20250908\n", + "# })\n", + "\n", + "# reporting_rate_data <- reporting_rate_data %>%\n", + "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", + "\n", + "# printdim(reporting_rate_data)" + ] + }, + { + "cell_type": "markdown", + "id": "8c11349d-598a-4882-a156-3e5b969ab76c", + "metadata": {}, + "source": [ + "### Import and format data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7838692f-fe89-446e-bac5-af5cc7324226", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f8eba51-b883-432b-8f7c-c4860cc9e78c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "# ADMIN_2" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "ceeba43a-e9ba-4b5a-8d0b-c4faade1367e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + "# ADMIN_2_LEVEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54bf61aa-c324-411f-8eea-93049d1bb252", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# THIS CODE SHOULD BE REMOVED, WE SHOULD ONLY LOAD REPORTING RATE ONCE IN THIS REPORT (parameter " + ] + }, + { + "cell_type": "markdown", + "id": "2338477e-2036-42e6-bfd3-c2b2480395c1", + "metadata": {}, + "source": [ + "**suggestion:** \n", + "- If possible I would try to reuse the same plotting code. So we can remove all the code as from here ...**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41da51da-7c8a-45d2-b492-a80071dfe2e3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import from Dataset\n", + "\n", + "reporting_rate_data <- tryCatch({\n", + " # Attempt to load the dataset\n", + " get_latest_dataset_file_in_memory(\n", + " DATASET_NAME, \n", + " paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", + " )\n", + "}, \n", + "error = function(e) {\n", + " # If an error occurs, log a warning\n", + " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataelement.parquet` does not exist, skipped loading. \n", + " To generate this file, re-run the reporting rate pipeline. 
Error:\", conditionMessage(e))\n", + " log_msg(msg, level = \"warning\")\n", + " \n", + " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", + " return(\n", + " tibble(\n", + " YEAR = double(),\n", + " MONTH = double(),\n", + " ADM2_ID = character(),\n", + " REPORTING_RATE = double()\n", + " )\n", + " )\n", + "})\n", + "\n", + "# Add _NAME cols from pyramid\n", + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + " ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + " \n", + " reporting_rate_data <- reporting_rate_data %>%\n", + " # left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\")) # old\n", + " left_join(pyramid_data, by = c(\"ADM2_ID\" = ADMIN_2_LEVEL))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dff04b9c-0ee7-44d3-a84f-150bce1c368f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# reporting_rate_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be95b4e-8678-44be-a361-d2216bcd741c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# if (nrow(reporting_rate_data) != 0) {\n", + "# reporting_rate_data <- reporting_rate_data %>%\n", + "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "id": "42155cf0-0475-45fc-a276-7ff2bd4ed555", + "metadata": {}, + "source": [ + "### Plot: Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d4d40b6-7d01-4ed3-83ba-39ffdfbe4b3d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Prepare date column + category\n", + "\n", + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "reporting_rate_data <- reporting_rate_data %>%\n", + " mutate(\n", + " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", 
+ " ADM2_ID = factor(ADM2_ID),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100,\n", + " reporting_pct = REPORTING_RATE * 100,\n", + " category = cut(\n", + " reporting_pct,\n", + " # breaks = c(-Inf, 50, 80, 90, Inf),\n", + " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", + " # right = FALSE\n", + " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", + " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", + " right = TRUE\n", + " )\n", + " )\n", + "\n", + "# # Define color scale\n", + "# reporting_colors <- c(\n", + "# \"<50\" = \"#d7191c\", # red\n", + "# \"50–80\" = \"#fdae61\", # orange\n", + "# \"80–90\" = \"#ffffbf\", # yellow\n", + "# \"≥90\" = \"#1a9641\" # green\n", + "# )\n", + "\n", + "# Define color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50–80\" = \"#fdae61\", # orange\n", + " \"80–90\" = \"#ffffbf\", # yellow\n", + " \"90-100\" = \"#1a9641\", # green\n", + " \">100\" = \"darkgreen\" # \"darkgreen\"\n", + ")\n", + "\n", + "# Plot heatmap\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data, aes(x = date, y = LEVEL_3_NAME, fill = category)) +\n", + " geom_tile() +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Taux de rapportage mensuels par district sanitaire\",\n", + " subtitle = \"Monthly Data Element Reporting Rate by Health District\",\n", + " x = \"Mois - Month\",\n", + " y = \"District Sanitaire - Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", + " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", + " legend.position = \"top\", # \"right\"\n", + " panel.grid = element_blank()\n", + " )\n", 
+ "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "b14436cf-1f78-4c3f-944b-0c1eb845a3f6", + "metadata": {}, + "source": [ + "### Plot: boxplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba3a3e89-17f5-4024-b039-dcedb0f37dc2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Prepare the data\n", + "\n", + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + "reporting_rate_data_box <- reporting_rate_data %>%\n", + " mutate(\n", + " MONTH = as.integer(MONTH),\n", + " YEAR = as.factor(YEAR),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100 # `pmin()` caps values to 1 (then, 100%)\n", + " reporting_pct = REPORTING_RATE * 100\n", + " )\n", + "\n", + "# Month labels in French\n", + "month_labels_fr <- c(\n", + " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", + " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1472d8f2-56d3-4b7a-82ef-4b344d87d264", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", + " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", + " scale_x_discrete(labels = month_labels_fr) +\n", + " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", + " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Distribution mensuelle du taux de rapportage\",\n", + " subtitle = \"Monthly Distribution of Data Element Reporting Rate by Health District (2021–2024)\",\n", + " x = \"Mois\",\n", + " fill = \"Année\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = 
\"bold\", size = 20),\n", + " plot.subtitle = element_text(size = 16),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0d275753-d21e-410a-b7d7-f265ac6e9235", + "metadata": {}, + "source": [ + "### Plot: choropleth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12674c7e-7745-464c-9cc5-3c1e6dcd63c4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Aggregate to annual reporting rate per district \n", + "annual_data <- reporting_rate_data %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", + " ungroup()\n", + "\n", + "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", + "map_data <- shapes_data %>%\n", + " left_join(annual_data, by = \"ADM2_ID\")\n", + "\n", + "# Step 3: Bin the reporting rate into categories\n", + "map_data <- map_data %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " ),\n", + " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\"))\n", + " )\n", + "\n", + "# Step 4: Define colors\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\",\n", + " \"50-80\" = \"#fdae61\",\n", + " \"80-90\" = \"#ffffbf\",\n", + " \">=90\" = \"#1a9641\"\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7777a718-2e6c-4f80-a7d6-ae304a1b49fb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 5: Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 10)\n", + "ggplot(map_data) 
+\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " facet_wrap(~ YEAR) +\n", + " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", + " labs(\n", + " title = \"Taux de rapportage des éléments de donnée annuels par district sanitaire\",\n", + " subtitle = \"Annual Data element Reporting Completeness by Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " strip.text = element_text(face = \"bold\", size = 16),\n", + " plot.title = element_text(face = \"bold\")\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f25ec0fb-758b-476c-8567-b0dce0a387d1", + "metadata": {}, + "source": [ + "### Plot: choropleth 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c891e330-f56f-4846-88c3-fd13a9fac8e7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", + "mean_reporting_stats <- map_data %>%\n", + " group_by(ADM2_ID) %>%\n", + " summarise(\n", + " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " )\n", + " )\n", + "\n", + "# Set correct factor levels to match legend\n", + "mean_reporting_stats$reporting_cat <- factor(\n", + " mean_reporting_stats$reporting_cat,\n", + " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + ")\n", + "\n", + "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", + "mean_reporting_map <- shapes_data %>%\n", + " left_join(st_drop_geometry(mean_reporting_stats), by = 
\"ADM2_ID\") %>%\n", + " st_as_sf()\n", + "\n", + "# Step 3: Define custom color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50-80\" = \"#fdae61\", # orange\n", + " \"80-90\" = \"#ffffbf\", # yellow\n", + " \">=90\" = \"#1a9641\" # green\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4032f2be-dc1e-48cf-9a5f-5c0854c32e9a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 4: Plot\n", + "options(repr.plot.width = 20, repr.plot.height = 10)\n", + "ggplot(mean_reporting_map) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\",\n", + " drop = FALSE\n", + " ) +\n", + " labs(\n", + " title = \"Taux moyen de rapportage (toutes années confondues)\",\n", + " subtitle = \"Mean Annual Data Element Reporting Rate (All Years Combined)\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0f1ea54-b02b-4523-b8b7-9dcdf30c39ba", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "80fa8c3c-ed62-4248-8149-ffe2974a7206", - "metadata": {}, - "source": [ - "# Taux de Rapportage des Formations Sanitaires - Health Facility Reporting Rates" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "35bc4c99-5e5c-44dc-8c67-7f38eaec708e", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"leaflet\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# Required environment for the sf packages\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd297a84-5a55-4374-9d2b-3148fde8072d", - "metadata": {}, - "outputs": [], - "source": [ - "# Configuration variables\n", - "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa63eb27-746f-420b-87ad-da82139acff9", - "metadata": {}, - "outputs": [], - "source": [ - "# print function\n", - "printdim 
<- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a603fdb-e3ae-4aa3-a908-0385ae216d49", - "metadata": {}, - "outputs": [], - "source": [ - "# import DHIS2 shapes data\n", - "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9656abb7-0085-4feb-974c-fb0b1c68c38f", - "metadata": {}, - "outputs": [], - "source": [ - "# import pyramid data\n", - "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "# Select distinct (already done in SNT format pipeline)\n", - "ADMIN_1_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), \"NAME\", \"ID\")\n", - "ADMIN_2_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), \"NAME\", \"ID\")\n", - "\n", - "pyramid_data <- pyramid_data %>%\n", - " distinct(across(all_of(c(ADMIN_1_ID, ADMIN_2_ID))), .keep_all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e912503-5c57-4997-8c68-da673bd14626", - "metadata": {}, - "outputs": [], - "source": [ - "print(dim(pyramid_data))\n", - "head(pyramid_data)" - ] - }, - { - "cell_type": "markdown", - "id": "78ec55d0-3a0d-413d-97cd-303895275f88", - "metadata": {}, - "source": [ - "## A) Taux 
de Soumission des Rapports / Dataset Reporting Rate\n", - "\n", - "**[FR]**\n", - "Cette section analyse le **taux de soumission des rapports**, tel que calculé dans le Système National d’Information Sanitaire (SNIS). Ce taux est défini comme le nombre de rapports effectivement reçus (rapports actuels) divisé par le nombre de rapports attendus (rapports attendus) sur une période donnée. Les rapports attendus correspondent au nombre de formations sanitaires qui, selon les paramètres du SNIS, devaient soumettre un rapport. Cet indicateur permet d’évaluer si les structures ont transmis les rapports requis, sans tenir compte du contenu ou de l’exhaustivité des données saisies.\n", - "\n", - "**[EN]**\n", - "This section analyzes the **dataset reporting rate**, as calculated in the Health Management Information System (HMIS). The rate is defined as the number of reports actually submitted (actual reports) divided by the number of reports expected (expected reports) over a given period. Expected reports refer to the number of health facilities that were required to report according to SNIS configuration. This indicator helps assess whether health facilities submitted their required reports, regardless of the content or completeness of the data within those reports." - ] - }, - { - "cell_type": "markdown", - "id": "793a685b-a5cc-4e12-9c78-e548beffa213", - "metadata": {}, - "source": [ - "**Question:** Can the reporting rate file be loaded only once (using a parameter see below)? -> if that's the case, we can remove specific plotting codes for the \"dataset\" and \"dataelement\" files and just keep one \"plotting code\" for both types \n", - "\n", - "**Suggestion (link to previous Question):** The file name can be parameterized by injecting the user selection via parameters={...} from the OpenHexa pipeline. 
\n", - "\n", - "> paste0(COUNTRY_CODE, \"_reporting_rate_\", REPORTING_RATE_METHOD ,\".parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2d9d428-c5a7-4f35-8a21-8f22adaa6a26", - "metadata": {}, - "outputs": [], - "source": [ - "# Import from Dataset\n", - "reporting_rate_data <- tryCatch({\n", - " # Attempt to load the dataset\n", - " get_latest_dataset_file_in_memory(\n", - " DATASET_NAME, \n", - " paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - " )\n", - " }, \n", - " error = function(e) {\n", - " # If an error occurs, log a warning\n", - " # msg <- paste(\"[WARNING] Warning: Could not load reporting rate file for:\", COUNTRY_CODE, \". Proceeding with empty data. Error:\", conditionMessage(e))\n", - " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataset.parquet` does not exist, skipped loading. \n", - " To generate this file, re-run the reporting rate pipeline. Error:\", conditionMessage(e))\n", - " log_msg(msg, level = \"warning\")\n", - " \n", - " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", - " return(\n", - " tibble(\n", - " YEAR = double(),\n", - " MONTH = double(),\n", - " ADM2_ID = character(),\n", - " REPORTING_RATE = double()\n", - " )\n", - " )\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d13dea1-e204-4c27-8a44-14e260bcdad1", - "metadata": {}, - "outputs": [], - "source": [ - "# Add _NAME cols from pyramid\n", - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - " ADMIN_2_ID <- str_replace(ADM_2, \"NAME\", \"ID\") \n", - " reporting_rate_data <- reporting_rate_data %>% \n", - " left_join(pyramid_data[c(ADM_2, ADMIN_2_ID)], by = c(\"ADM2_ID\" = ADMIN_2_ID))\n", - " \n", - " colnames(reporting_rate_data)[colnames(reporting_rate_data) == ADM_2] <- \"ADM2_NAME\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b96b0b12-e168-421f-b0a9-76e83c48842c", 
- "metadata": {}, - "outputs": [], - "source": [ - "head(reporting_rate_data, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "64d787c8-25d3-4b2c-87e9-6d35b206b018", - "metadata": {}, - "source": [ - "**fix:** \n", - " - Just replaced this line with the variable \"ADM2_NAME\" : \n", - "> Plot heatmap \n", - "> options(repr.plot.width = 18, repr.plot.height = 15) \n", - "> ggplot(reporting_rate_data, aes(x = date, y = **ADM2_NAME**, fill = category)) + " - ] - }, - { - "cell_type": "markdown", - "id": "c13f15a4-2788-4d57-9edc-78d0afdbe278", - "metadata": {}, - "source": [ - "### Plot: Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb3b565e-8497-4d88-8d6d-ae6b3e2929b2", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - " # Prepare date column + category\n", - " reporting_rate_data <- reporting_rate_data %>%\n", - " mutate(\n", - " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", - " ADM2_ID = factor(ADM2_ID),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100, # `pmin()` caps to 100%\n", - " reporting_pct = REPORTING_RATE * 100,\n", - " category = cut(\n", - " reporting_pct,\n", - " # breaks = c(-Inf, 50, 80, 90, Inf),\n", - " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", - " # GP 2025-08-07 added this, but double check (seems too many >100!!)\n", - " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", - " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", - " right = TRUE # FALSE: intervals are left-closed: lower bound is included\n", - " )\n", - " )\n", - " \n", - " # Define color scale\n", - " reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50–80\" = \"#fdae61\", # orange\n", - " \"80–90\" = \"#ffffbf\", # yellow\n", - " \"90-100\" = \"#1a9641\", # green\n", - " \">100\" = \"darkgreen\"\n", - " )\n", - " \n", - " # Plot heatmap\n", - " options(repr.plot.width = 18, repr.plot.height = 15)\n", - " 
ggplot(reporting_rate_data, aes(x = date, y = ADM2_NAME, fill = category)) + # -> Using a ADM2_NAME Variable to select the column !!\n", - " geom_tile() +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Taux de soumission des rapports mensuels par district sanitaire\",\n", - " subtitle = \"Monthly Dataset Reporting Rate by Health District\",\n", - " x = \"Mois - Month\",\n", - " y = \"District Sanitaire - Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", - " # legend.position = \"right\",\n", - " legend.position = \"top\",\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28662f31-8aa9-4f83-8dd6-8eb489723652", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Prepare the data\n", - "reporting_rate_data_box <- reporting_rate_data %>%\n", - " mutate(\n", - " MONTH = as.integer(MONTH),\n", - " YEAR = as.factor(YEAR),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100\n", - " reporting_pct = REPORTING_RATE * 100\n", - " )\n", - "\n", - "# Month labels in French\n", - "month_labels_fr <- c(\n", - " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", - " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", - ")\n", - "\n", - "# Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", - " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", - " scale_x_discrete(labels = month_labels_fr) +\n", - 
" # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", - " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", - " labs(\n", - " title = \"Distribution mensuelle du taux de soumission des rapports\",\n", - " subtitle = \"Monthly Distribution of Dataset Reporting Rate by Health District (2021–2024)\",\n", - " x = \"Mois\",\n", - " fill = \"Année\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " plot.subtitle = element_text(size = 16),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fd09749-2042-49d4-b2a3-9c2e6e5eae52", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Aggregate to annual reporting rate per district\n", - "annual_data <- reporting_rate_data %>%\n", - " group_by(YEAR, ADM2_ID) %>%\n", - " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", - " ungroup()\n", - "\n", - "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", - "map_data <- shapes_data %>%\n", - " left_join(annual_data, by = \"ADM2_ID\")\n", - "\n", - "# Step 3: Bin the reporting rate into categories\n", - "map_data <- map_data %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-79\", # \"50-80\"\n", - " reporting_rate < 0.9 ~ \"80-89\", # \"80-90\"\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " ),\n", - " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-79\", \"80-89\", \">=90\")) # levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - " )\n", - "\n", - "# Step 4: Define colors\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\",\n", - " \"50-79\" = \"#fdae61\",\n", - " \"80-89\" = \"#ffffbf\",\n", - " \">=90\" = 
\"#1a9641\"\n", - ")\n", - "\n", - "# Step 5: Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 10)\n", - "ggplot(map_data) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " facet_wrap(~ YEAR) +\n", - " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", - " labs(\n", - " title = \"Taux de soumission des rapports annuels par district sanitaire\",\n", - " subtitle = \"Annual Dataset Reporting Completeness by Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " strip.text = element_text(face = \"bold\", size = 16),\n", - " plot.title = element_text(face = \"bold\")\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df3f7949-baff-4cd9-a022-594420765289", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", - "mean_reporting_stats <- map_data %>%\n", - " group_by(ADM2_ID) %>%\n", - " summarise(\n", - " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " )\n", - " )\n", - "\n", - "# Set correct factor levels to match legend\n", - "mean_reporting_stats$reporting_cat <- factor(\n", - " mean_reporting_stats$reporting_cat,\n", - " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - ")\n", - "\n", - "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", - "mean_reporting_map <- shapes_data %>%\n", - " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", - " st_as_sf()\n", - "\n", - "# Step 3: Define custom 
color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50-80\" = \"#fdae61\", # orange\n", - " \"80-90\" = \"#ffffbf\", # yellow\n", - " \">=90\" = \"#1a9641\" # green\n", - ")\n", - "\n", - "# Step 4: Plot\n", - "options(repr.plot.width = 20, repr.plot.height = 10)\n", - "ggplot(mean_reporting_map) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\",\n", - " drop = FALSE\n", - " ) +\n", - " labs(\n", - " title = \"Taux moyen de soumission des rapports (toutes années confondues)\",\n", - " subtitle = \"Mean Annual Dataset Reporting Rate (All Years Combined)\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9497287c-bbd2-446f-946d-88e34233f9f0", - "metadata": {}, - "source": [ - "## B) Taux de rapportage des éléments de données: cas confirmés / Data element Reporting Rate: confirmed cases\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f8cd01e-e4b5-41f5-bff8-d35a1b143d0e", - "metadata": {}, - "outputs": [], - "source": [ - "# # import data\n", - "# # was: routine_data\n", - "# reporting_rate_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")) }, \n", - "# error = function(e) {\n", - "# msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - "# # cat(msg)\n", - "# log_msg(msg, level = \"warning\") # GP 20250908\n", - "# # stop(msg) # GP 20250908\n", - "# })\n", - "\n", - "# reporting_rate_data <- reporting_rate_data %>%\n", - "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", - "\n", - "# printdim(reporting_rate_data)" 
- ] - }, - { - "cell_type": "markdown", - "id": "8c11349d-598a-4882-a156-3e5b969ab76c", - "metadata": {}, - "source": [ - "### Import and format data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7838692f-fe89-446e-bac5-af5cc7324226", - "metadata": {}, - "outputs": [], - "source": [ - "# config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f8eba51-b883-432b-8f7c-c4860cc9e78c", - "metadata": {}, - "outputs": [], - "source": [ - "# ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "# ADMIN_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ceeba43a-e9ba-4b5a-8d0b-c4faade1367e", - "metadata": {}, - "outputs": [], - "source": [ - "# ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - "# ADMIN_2_LEVEL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54bf61aa-c324-411f-8eea-93049d1bb252", - "metadata": {}, - "outputs": [], - "source": [ - "# THIS CODE SHOULD BE REMOVED, WE SHOULD ONLY LOAD REPORTING RATE ONCE IN THIS REPORT (parameter " - ] - }, - { - "cell_type": "markdown", - "id": "2338477e-2036-42e6-bfd3-c2b2480395c1", - "metadata": {}, - "source": [ - "**suggestion:** \n", - "- If possible I would try to reuse the same plotting code. 
So we can remove all the code as from here ...**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41da51da-7c8a-45d2-b492-a80071dfe2e3", - "metadata": {}, - "outputs": [], - "source": [ - "# Import from Dataset\n", - "\n", - "reporting_rate_data <- tryCatch({\n", - " # Attempt to load the dataset\n", - " get_latest_dataset_file_in_memory(\n", - " DATASET_NAME, \n", - " paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - " )\n", - "}, \n", - "error = function(e) {\n", - " # If an error occurs, log a warning\n", - " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataelement.parquet` does not exist, skipped loading. \n", - " To generate this file, re-run the reporting rate pipeline. Error:\", conditionMessage(e))\n", - " log_msg(msg, level = \"warning\")\n", - " \n", - " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", - " return(\n", - " tibble(\n", - " YEAR = double(),\n", - " MONTH = double(),\n", - " ADM2_ID = character(),\n", - " REPORTING_RATE = double()\n", - " )\n", - " )\n", - "})\n", - "\n", - "# Add _NAME cols from pyramid\n", - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - " ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - " \n", - " reporting_rate_data <- reporting_rate_data %>%\n", - " # left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\")) # old\n", - " left_join(pyramid_data, by = c(\"ADM2_ID\" = ADMIN_2_LEVEL))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dff04b9c-0ee7-44d3-a84f-150bce1c368f", - "metadata": {}, - "outputs": [], - "source": [ - "# reporting_rate_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0be95b4e-8678-44be-a361-d2216bcd741c", - "metadata": {}, - "outputs": [], - "source": [ - "# if (nrow(reporting_rate_data) != 0) {\n", - "# reporting_rate_data <- 
reporting_rate_data %>%\n", - "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", - "# }" - ] - }, - { - "cell_type": "markdown", - "id": "42155cf0-0475-45fc-a276-7ff2bd4ed555", - "metadata": {}, - "source": [ - "### Plot: Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d4d40b6-7d01-4ed3-83ba-39ffdfbe4b3d", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare date column + category\n", - "\n", - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "reporting_rate_data <- reporting_rate_data %>%\n", - " mutate(\n", - " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", - " ADM2_ID = factor(ADM2_ID),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100,\n", - " reporting_pct = REPORTING_RATE * 100,\n", - " category = cut(\n", - " reporting_pct,\n", - " # breaks = c(-Inf, 50, 80, 90, Inf),\n", - " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", - " # right = FALSE\n", - " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", - " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", - " right = TRUE\n", - " )\n", - " )\n", - "\n", - "# # Define color scale\n", - "# reporting_colors <- c(\n", - "# \"<50\" = \"#d7191c\", # red\n", - "# \"50–80\" = \"#fdae61\", # orange\n", - "# \"80–90\" = \"#ffffbf\", # yellow\n", - "# \"≥90\" = \"#1a9641\" # green\n", - "# )\n", - "\n", - "# Define color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50–80\" = \"#fdae61\", # orange\n", - " \"80–90\" = \"#ffffbf\", # yellow\n", - " \"90-100\" = \"#1a9641\", # green\n", - " \">100\" = \"darkgreen\" # \"darkgreen\"\n", - ")\n", - "\n", - "# Plot heatmap\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data, aes(x = date, y = LEVEL_3_NAME, fill = category)) +\n", - " geom_tile() +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Taux 
de rapportage mensuels par district sanitaire\",\n", - " subtitle = \"Monthly Data Element Reporting Rate by Health District\",\n", - " x = \"Mois - Month\",\n", - " y = \"District Sanitaire - Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", - " legend.position = \"top\", # \"right\"\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "b14436cf-1f78-4c3f-944b-0c1eb845a3f6", - "metadata": {}, - "source": [ - "### Plot: boxplot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba3a3e89-17f5-4024-b039-dcedb0f37dc2", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the data\n", - "\n", - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - "reporting_rate_data_box <- reporting_rate_data %>%\n", - " mutate(\n", - " MONTH = as.integer(MONTH),\n", - " YEAR = as.factor(YEAR),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100 # `pmin()` caps values to 1 (then, 100%)\n", - " reporting_pct = REPORTING_RATE * 100\n", - " )\n", - "\n", - "# Month labels in French\n", - "month_labels_fr <- c(\n", - " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", - " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1472d8f2-56d3-4b7a-82ef-4b344d87d264", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", - " geom_boxplot(outlier.size = 0.8, outlier.alpha 
= 0.4) +\n", - " scale_x_discrete(labels = month_labels_fr) +\n", - " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", - " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", - " labs(\n", - " title = \"Distribution mensuelle du taux de rapportage\",\n", - " subtitle = \"Monthly Distribution of Data Element Reporting Rate by Health District (2021–2024)\",\n", - " x = \"Mois\",\n", - " fill = \"Année\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " plot.subtitle = element_text(size = 16),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "0d275753-d21e-410a-b7d7-f265ac6e9235", - "metadata": {}, - "source": [ - "### Plot: choropleth" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12674c7e-7745-464c-9cc5-3c1e6dcd63c4", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Aggregate to annual reporting rate per district \n", - "annual_data <- reporting_rate_data %>%\n", - " group_by(YEAR, ADM2_ID) %>%\n", - " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", - " ungroup()\n", - "\n", - "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", - "map_data <- shapes_data %>%\n", - " left_join(annual_data, by = \"ADM2_ID\")\n", - "\n", - "# Step 3: Bin the reporting rate into categories\n", - "map_data <- map_data %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " ),\n", - " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\"))\n", - " )\n", - "\n", - "# Step 4: Define colors\n", - "reporting_colors <- 
c(\n", - " \"<50\" = \"#d7191c\",\n", - " \"50-80\" = \"#fdae61\",\n", - " \"80-90\" = \"#ffffbf\",\n", - " \">=90\" = \"#1a9641\"\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7777a718-2e6c-4f80-a7d6-ae304a1b49fb", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 5: Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 10)\n", - "ggplot(map_data) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " facet_wrap(~ YEAR) +\n", - " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", - " labs(\n", - " title = \"Taux de rapportage des éléments de donnée annuels par district sanitaire\",\n", - " subtitle = \"Annual Data element Reporting Completeness by Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " strip.text = element_text(face = \"bold\", size = 16),\n", - " plot.title = element_text(face = \"bold\")\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f25ec0fb-758b-476c-8567-b0dce0a387d1", - "metadata": {}, - "source": [ - "### Plot: choropleth 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c891e330-f56f-4846-88c3-fd13a9fac8e7", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", - "mean_reporting_stats <- map_data %>%\n", - " group_by(ADM2_ID) %>%\n", - " summarise(\n", - " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " )\n", - " 
)\n", - "\n", - "# Set correct factor levels to match legend\n", - "mean_reporting_stats$reporting_cat <- factor(\n", - " mean_reporting_stats$reporting_cat,\n", - " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - ")\n", - "\n", - "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", - "mean_reporting_map <- shapes_data %>%\n", - " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", - " st_as_sf()\n", - "\n", - "# Step 3: Define custom color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50-80\" = \"#fdae61\", # orange\n", - " \"80-90\" = \"#ffffbf\", # yellow\n", - " \">=90\" = \"#1a9641\" # green\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4032f2be-dc1e-48cf-9a5f-5c0854c32e9a", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 4: Plot\n", - "options(repr.plot.width = 20, repr.plot.height = 10)\n", - "ggplot(mean_reporting_map) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\",\n", - " drop = FALSE\n", - " ) +\n", - " labs(\n", - " title = \"Taux moyen de rapportage (toutes années confondues)\",\n", - " subtitle = \"Mean Annual Data Element Reporting Rate (All Years Combined)\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0f1ea54-b02b-4523-b8b7-9dcdf30c39ba", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": 
".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r b/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r new file mode 100644 index 0000000..bc84a21 --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r @@ -0,0 +1,79 @@ +# Shared helpers for snt_dhis2_reporting_rate notebooks. + +inspect_reporting_rate <- function(data_tibble) { + tibble_name_full <- deparse(substitute(data_tibble)) + method <- stringr::str_extract(tibble_name_full, "(?<=reporting_rate_).*") + + values_greater_than_1 <- sum(data_tibble$REPORTING_RATE > 1, na.rm = TRUE) + total_values <- length(data_tibble$REPORTING_RATE) + + if (total_values > 0) { + proportion <- values_greater_than_1 / total_values * 100 + min_rate <- min(data_tibble$REPORTING_RATE, na.rm = TRUE) + max_rate <- max(data_tibble$REPORTING_RATE, na.rm = TRUE) + } else { + proportion <- 0 + min_rate <- NA + max_rate <- NA + } + + clarification <- if (proportion == 0) NULL else " (there are more reports than expected)" + + log_msg( + paste0( + "🔍 For reporting rate method : `", method, "`, the values of REPORTING_RATE range from ", round(min_rate, 2), + " to ", round(max_rate, 2), + ", and ", round(proportion, 2), " % of values are >1", clarification, "." 
+ ) + ) + + hist(data_tibble$REPORTING_RATE, breaks = 50) +} + +is_aire_l5 <- function(x) { + stringr::str_detect(x, stringr::regex("^\\s*aire[^a-zA-Z]?", ignore_case = TRUE)) +} + +is_hospital_l4 <- function(x) { + stringr::str_detect(x, stringr::regex("^(hd|chr|chu|hgr)", ignore_case = TRUE)) +} + +snt_write_csv <- function(x, output_data_path, method, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export reporting rate csv.") + } + + full_directory_path <- file.path(output_data_path, "reporting_rate") + if (!dir.exists(full_directory_path)) { + dir.create(full_directory_path, recursive = TRUE) + } + + file_path <- file.path(full_directory_path, paste0(country_code, "_reporting_rate_", method, ".csv")) + readr::write_csv(x, file_path) + log_msg(paste0("Exported : ", file_path)) +} + +snt_write_parquet <- function(x, output_data_path, method, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export reporting rate parquet.") + } + + full_directory_path <- file.path(output_data_path, "reporting_rate") + if (!dir.exists(full_directory_path)) { + dir.create(full_directory_path, recursive = TRUE) + } + + file_path <- file.path(full_directory_path, paste0(country_code, "_reporting_rate_", method, ".parquet")) + arrow::write_parquet(x, file_path) + log_msg(paste0("Exported : ", file_path)) +} + +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} From b525673a749119cf9d2a31c1f41c4969f47badb7 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 30 Mar 2026 17:15:18 +0200 Subject: [PATCH 17/23] main version of incidence --- .../code/snt_dhis2_incidence.ipynb | 1050 +++++------------ 
.../utils/snt_dhis2_incidence.r | 564 ++++++++- 2 files changed, 842 insertions(+), 772 deletions(-) diff --git a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb index c0cb6eb..a2a4d97 100644 --- a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb +++ b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb @@ -2,65 +2,10 @@ "cells": [ { "cell_type": "markdown", - "id": "f5827740-2917-4504-9017-9ec7d408e5f4", - "metadata": {}, - "source": [ - "Script structure:\n", - " 1. Setup:\n", - " * Paths\n", - " * Utils functions\n", - " * Load and check config file\n", - " 2. Load Data\n", - " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", - " * **Population data** (DHIS2) already formatted & aggregated (output of pipeline YYY) & aggregated at **ADM2 x YEAR** level
\n", - " **Note**: in some Countries (i.e., Niger), population and and crude incidence data is also available for **specific sections** of the popultion (i.e., preganant women, children under 5)\n", - " * (optional) **Care seeking (taux recherche soins)** (DHS)\n", - " * **Reporting Rate**, based on what is available (last run reporting rate pipeline), uses _either_ one of:\n", - " * \"**Dataset**\": pre-cumputed (directly downloadable from SNIS DHIS2 instance) and formatted&aligned elsewhere (output of pipelibe `dhis2-reporting-rate`)\n", - " * \"**Data Element**: calculated from routine DHIS2 data, based on reports for defined indicators and \"active\" facilities\n", - " 3. Calculate **Incidence**\n", - " 1. calculate **monthly cases**\n", - " 2. calculate **yearly incidence**: Crude, Adjusted 1 (Test Positivity Rate), Adjusted 2 (Reporting Rate), (optional) Adjusted 3 (Care Seeking Behaviour)" - ] - }, - { - "cell_type": "markdown", - "id": "cdd5409b-dc0e-45f4-ae4e-dffcdb25059b", - "metadata": {}, - "source": [ - "-------------------\n", - "**Naming harmonization to improve code readability:**\n", - "\n", - "**Incidence**, COLUMN NAMES (always capitalized!):\n", - "* \"INCIDENCE_CRUDE\" = \"Crude\"\n", - "* \"INCIDENCE_ADJ_TESTING\" = \"Adjusted 1 (Testing)\"\n", - "* \"INCIDENCE_ADJ_REPORTING\" = \"Adjusted 2 (Reporting)\"\n", - "* _\"INCIDENCE_ADJ_CARESEEKING\" = \"Adjusted 3 (Careseeking)\"_ ⚠️is this good naming?" - ] - }, - { - "cell_type": "markdown", - "id": "96d5dffc-ff34-4a14-b2b7-1e71e6afad07", - "metadata": {}, - "source": [ - "**Reporting Rate** data frames, based on two **methods**:\n", - "* follwo this structure: reporting\\_rate\\_\\. 
So:\n", - " * **Dataset**: `reporting_rate_dataset` (for report nb only: `reporting_rate_dataset_year`)\n", - " * **Data Element** (Diallo 2025): `reporting_rate_dataelement` (for report nb only: `reporting_rate_dataelement_year`)" - ] - }, - { - "cell_type": "markdown", - "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", - "metadata": {}, - "source": [ - "--------------------" - ] - }, - { - "cell_type": "markdown", - "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", - "metadata": {}, + "id": "bf86fe04", + "metadata": { + "tags": [] + }, "source": [ "## 1. Setup" ] @@ -68,7 +13,9 @@ { "cell_type": "markdown", "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 1.0. Fallback parameters" ] @@ -78,6 +25,9 @@ "execution_count": null, "id": "72fad25e-85fd-4ae9-8fe3-c142077f8d67", "metadata": { + "tags": [ + "parameters" + ], "vscode": { "languageId": "r" } @@ -85,43 +35,75 @@ "outputs": [], "source": [ "# ----- ⚡ Defined in pipeline.py code ---------------\n", + "ROOT_PATH <- if (exists(\"ROOT_PATH\")) ROOT_PATH else \"/home/hexa/workspace\"\n", + "SNT_ROOT_PATH <- if (exists(\"SNT_ROOT_PATH\")) SNT_ROOT_PATH else ROOT_PATH\n", "if (!exists(\"N1_METHOD\")) N1_METHOD <- \"SUSP-TEST\" # ⚡ For N1 calculations: use `SUSP-TEST` or `PRES`\n", "if (!exists(\"ROUTINE_DATA_CHOICE\")) ROUTINE_DATA_CHOICE <- \"raw\" # \"raw\" \"raw_without_outliers\" \"imputed\"\n", "if (!exists(\"USE_CSB_DATA\")) USE_CSB_DATA <- FALSE # ⚡ USE_CSB_DATA bool\n", - "if (!exists(\"USE_ADJUSTED_POPULATION\")) USE_ADJUSTED_POPULATION <- FALSE # ⚡ USE_ADJUSTED_POPULATION bool " + "if (!exists(\"USE_ADJUSTED_POPULATION\")) USE_ADJUSTED_POPULATION <- FALSE # ⚡ USE_ADJUSTED_POPULATION bool \n", + "\n", + "# 👥 Population Disaggregation \n", + "# Only for countries in which disaggregated data is available. 
Pipeline fails if you select something you don't have.\n", + "if (!exists(\"DISAGGREGATION_SELECTION\")) DISAGGREGATION_SELECTION <- NULL # Options: \"PREGNANT_WOMAN\", \"UNDER_5\", ... \n", + "# Disaggregation options set in pipeline.py parameters, based on \n", + "# https://bluesquare.atlassian.net/browse/SNT25-363?focusedCommentId=85587" ] }, { "cell_type": "markdown", - "id": "d7d2f065-f8ad-4580-aa24-64a6d9bd7acb", - "metadata": {}, + "id": "ecff4a51-c6fa-4e84-a465-5bb87d9b1333", + "metadata": { + "tags": [] + }, "source": [ - "#### 👥 Population Disaggregation \n", - "Only for countries in which disaggregated data is available. Pipeline fails if you select something you don't have." + "### 1.1. Run setup" ] }, { "cell_type": "code", "execution_count": null, - "id": "63362c4a-6a55-4310-aa7a-81bea39aa734", + "id": "7bf70b5f", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "if (!exists(\"DISAGGREGATION_SELECTION\")) DISAGGREGATION_SELECTION <- \"UNDER_5\" # NULL # Options: \"PREGNANT_WOMAN\", \"UNDER_5\", ... \n", - "# Disaggregation options set in pipeline.py parameters, based on \n", - "# https://bluesquare.atlassian.net/browse/SNT25-363?focusedCommentId=85587" + "PIPELINE_PATH <- if (exists(\"PIPELINE_PATH\")) PIPELINE_PATH else file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_incidence\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_incidence.r\"))" ] }, { - "cell_type": "markdown", - "id": "ecff4a51-c6fa-4e84-a465-5bb87d9b1333", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "id": "f60c5875", + "metadata": { + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "### 1.1. 
Run setup" + "load_utils()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50e73623", + "metadata": { + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "setup_paths()\n", + "create_intermediate_data_dir()" ] }, { @@ -129,57 +111,39 @@ "execution_count": null, "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# PROJECT PATHS\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_incidence\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", - "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\")) # palettes \n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_incidence.r\"))\n", - "\n", "# List required pcks\n", "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right path\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "install_and_load(required_packages)" ] }, { "cell_type": "code", "execution_count": null, - "id": "22dbb20b", + "id": "8f317748", "metadata": { + "tags": [], "vscode": { 
"languageId": "r" } }, "outputs": [], "source": [ - "if (!dir.exists(INTERMEDIATE_DATA_PATH)) {\n", - " dir.create(INTERMEDIATE_DATA_PATH, recursive = TRUE)\n", - " log_msg(glue(\"Created directory for intermediate results: {INTERMEDIATE_DATA_PATH}\"))\n", - "}" + "set_env_openhexa()" ] }, { "cell_type": "markdown", "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 1.2. Load and check `config` file\n", "\n", @@ -189,63 +153,37 @@ { "cell_type": "code", "execution_count": null, - "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", + "id": "443bd9aa", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")) \n", - "log_msg(msg)\n", - "\n", - "# Generic\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"ANY\"\n", - "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "# Fixed routine formatting columns\n", - "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') \n", - "print(paste(\"Fixed routine data ('dhis2_routine') columns (always expected): \", paste(fixed_cols, collapse=\", \")))" + "import_config_json()\n", + "config_generic()\n", + "config_incidence()\n", + "set_fixed_cols()" ] }, { "cell_type": "markdown", "id": "95006478", - "metadata": {}, - "source": [ - "### 1.3. 
Helper function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa504ca5-928c-4778-ad31-5c4de7bbbf60", "metadata": { - "vscode": { - "languageId": "r" - } + "tags": [] }, - "outputs": [], "source": [ - "# resolve_routine_filename() loaded from utils/snt_dhis2_incidence.r" + "### 1.3. Helper function(s)" ] }, { "cell_type": "markdown", "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "## 2. Load Data" ] @@ -253,191 +191,91 @@ { "cell_type": "markdown", "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", - "metadata": {}, - "source": [ - "### 2.1. **Routine** data (DHIS2) (parametrized choice)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ddb31b18", "metadata": { - "vscode": { - "languageId": "r" - } + "tags": [] }, - "outputs": [], "source": [ - "# select routine dataset and filename\n", - "if (ROUTINE_DATA_CHOICE == \"raw\") { \n", - " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", - " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", - "} else { \n", - " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", - " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", - "}" + "### 2.1. 
**Routine** data (DHIS2) (parametrized choice)" ] }, { "cell_type": "code", "execution_count": null, - "id": "0849f905", + "id": "9379ca4b", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Warn when incidence routine choice differs from latest reporting-rate routine choice\n", - "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "rr_parameters <- tryCatch(\n", - " {\n", - " get_latest_dataset_file_in_memory(rr_dataset_name, paste0(COUNTRY_CODE, \"_parameters.json\"))\n", - " },\n", - " error = function(e) {\n", - " log_msg(\n", - " paste0(\n", - " \"[WARNING] Could not load reporting-rate parameters file from dataset `\",\n", - " rr_dataset_name,\n", - " \"` (\",\n", - " conditionMessage(e),\n", - " \"). Skipping consistency check between reporting rate and incidence routine data choices.\"\n", - " ),\n", - " \"warning\"\n", - " )\n", - " return(NULL)\n", - " }\n", - ")\n", - "\n", - "reporting_routine_choice <- infer_reporting_routine_choice(rr_parameters)\n", - "if (!is.null(reporting_routine_choice) && reporting_routine_choice != ROUTINE_DATA_CHOICE) {\n", - " log_msg(\n", - " paste0(\n", - " \"[WARNING] Routine data mismatch detected: incidence is running with `\",\n", - " ROUTINE_DATA_CHOICE,\n", - " \"` while the latest reporting-rate run used `\",\n", - " reporting_routine_choice,\n", - " \"`. 
Please ensure this is intentional.\"\n", - " ),\n", - " \"warning\"\n", - " )\n", - "}\n" + "select_routine_dataset_and_filename()" ] }, { "cell_type": "code", "execution_count": null, - "id": "30691d35-f859-4f92-8eb2-5791a425f153", + "id": "d5160802", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Load file from dataset \n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, \n", - " error = function(e) { \n", - " # Check if the error message indicates that the file does not exist \n", - " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", - " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", - " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", - " } else {\n", - " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". [ERROR DETAILS] \" , conditionMessage(e))\n", - " } \n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 routine data : `\", routine_filename, \"` loaded from dataset : `\", routine_dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", - "log_msg(msg)\n", - "\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" + "load_dhis2_routine_data()" ] }, { "cell_type": "markdown", "id": "b78c12ec-407f-4088-9a7f-08838b2d208b", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "#### Checks on routine data columns" + "#### Checks" ] }, { "cell_type": "markdown", - "id": "b1dcb02d", - "metadata": {}, + "id": "3243602c", + "metadata": { + "tags": [] + }, "source": [ - " `fixed_cols`: Fixed columns that should be always present regardless of the config." + "... 
on routine data columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "b3514f20-3726-436e-b34b-7a171d1718d4", + "id": "c0aec2ea", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Check if all \"fixed\" cols are present in dhis2_routine\n", - "actual_cols <- colnames(dhis2_routine) # dhis2_routine\n", - "missing_cols <- setdiff(fixed_cols, actual_cols) # Columns in fixed_cols but not in actual_cols)\n", + "# `fixed_cols`: Fixed columns that should be always present regardless of the config.\n", + "check_fixed_cols_in_routine()\n", "\n", - "# Check if all required columns are present\n", - "all_present <- length(missing_cols) == 0\n", - "if (all_present) { \n", - " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'fixed' columns: \", paste(fixed_cols, collapse = \", \"), \".\"))\n", - "} else {\n", - " log_msg(paste0(\"🚨 Missing Columns: The following required columns are NOT present in 'dhis2_routine': \", paste(missing_cols, collapse = \", \"), \".\"), \"warning\")\n", - "}" + "# `DHIS2_INDICATORS`: Indicators, as defined in the config.json file, \n", + "# are expected to be present if the extraction pipeline and this pipeline are run on the same config settings.\n", + "check_dhis2_indicators_cols_in_routine()" ] }, { "cell_type": "markdown", - "id": "cd203dec-61b2-4510-9c84-30054e7b99e2", - "metadata": {}, - "source": [ - "`DHIS2_INDICATORS`: Indicators, as defined in the config.json file, are expected to be present if the extraction pipeline and this pipeline are run on the same config settings." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb04b888-8c5e-452a-8eb4-96025b0fa65a", + "id": "09e8a5a9", "metadata": { - "vscode": { - "languageId": "r" - } + "tags": [] }, - "outputs": [], - "source": [ - "# Check if all \"DHIS2_INDICATORS\" cols are present in dhis2_routine\n", - "missing_cols <- setdiff(DHIS2_INDICATORS, actual_cols) # all elements in DHIS2_INDICATORS but not in actual_cols\n", - "all_present <- length(missing_cols) == 0\n", - "if (all_present) { \n", - " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'DHIS2_INDICATORS' columns: \", paste(DHIS2_INDICATORS, collapse = \", \"), \".\"))\n", - "} else {\n", - " log_msg(paste0(\n", - " \"🚨 Missing Columns: The following columns for DHIS2 INDICATORS are NOT present in 'dhis2_routine': \",\n", - " paste(missing_cols, collapse = \", \"),\n", - " \".\\n🚨 Looks like the config.json file was modified after extraction.\\n🚨 The analysis will continue WITHOUT the missing indicators.\"\n", - " ), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3ba1a6e8-aa08-4624-a6a5-4852bf4127e4", - "metadata": {}, "source": [ - "#### Checks on `N1_METHOD` selected\n", + "... on `N1_METHOD` selected:
\n", "_**if**_ `N1_METHOD == PRES` then `PRES` must exist in config.json file _and_ in routine data
\n", "_**else**_ N1 will use `SUSP-TEST` instead" ] @@ -445,181 +283,94 @@ { "cell_type": "code", "execution_count": null, - "id": "96a7025e-083b-464d-8498-f7fdff493293", + "id": "cc258e7a", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Check that col `PRES` exists in both config file and routine data\n", - "if (N1_METHOD == \"PRES\") {\n", - " pres_in_routine <- any(names(dhis2_routine) == \"PRES\")\n", - " pres_in_config <- any(DHIS2_INDICATORS == \"PRES\")\n", - "\n", - " if (!pres_in_routine) {\n", - " log_msg(\"🛑 Column `PRES` missing from routine data! 🚨 N1 calculations will use `SUSP-TEST` instead!\", \"error\")\n", - " stop()\n", - " }\n", - " if (!pres_in_config) {\n", - " log_msg(\"⚙️ Note: `PRES` set as parameter in this pipeline, but not defined as indicator in the configuration file (SNT_config.json)\", \"error\")\n", - " stop()\n", - " }\n", - "}" + "check_PRES_col()" ] }, { "cell_type": "markdown", - "id": "1c5e84cf", - "metadata": {}, + "id": "88b833f6", + "metadata": { + "tags": [] + }, "source": [ - "#### 👥 Population Disaggregation logic" + "### 2.2. 
**Population** data at level `ADM2` x `YEAR`" ] }, { "cell_type": "code", "execution_count": null, - "id": "458e3d78-3552-4447-93f8-6812a5d655be", + "id": "001f6bf9", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "INDICATORS_FOUND <- FALSE # 👈 \n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && !is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", - "if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", - "\n", - " # Determine the dynamic prefix based on the method\n", - " prefix_method <- ifelse(N1_METHOD == \"SUSP-TEST\", \"SUSP\", \"PRES\")\n", - " prefix_all <- c(prefix_method, \"TEST\", \"CONF\") \n", - " # Define the expected column names \n", - " # (also make available for the 'else' warning message if the check fails)\n", - " target_colnames <- glue(\"{prefix_all}_{DISAGGREGATION_SELECTION}\")\n", - " \n", - " if (all(target_colnames %in% colnames(dhis2_routine))) {\n", - " \n", - " # We map the specific columns (e.g., SUSP_UNDER5) to generic names (e.g., SUSP)\n", - " dhis2_routine[prefix_all] <- dhis2_routine[target_colnames]\n", - " \n", - " for (col in target_colnames) {\n", - " log_msg(glue(\"Population Disaggregation: Successfully mapped indicator: {col}\"))\n", - " }\n", - " \n", - " # Signal success for the next code block\n", - " INDICATORS_FOUND <- TRUE\n", - " \n", - " } else {\n", - " missing_cols <- setdiff(target_colnames, colnames(dhis2_routine))\n", - " log_msg(glue(\"Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed.\"), \"warning\")\n", - " log_msg(glue(\"Population Disaggregation: Missing columns in routine dataset: {paste(missing_cols, collapse = ', ')}\"), \"warning\")\n", - " \n", - " msg <- glue(\"[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.\") \n", - " stop(msg)\n", - " }\n", - "}" + "load_population_data()" ] }, { 
"cell_type": "markdown", - "id": "4473e75e-94d2-4f24-b6eb-38a7685542ad", - "metadata": {}, - "source": [ - "### 2.2. Load population data at level ADM2 x YEAR\n", - "\n", - "Already formatted & aggregated. \n", - "\n", - "**Expecting** table with these **cols** (bold = **must have**): \n", - "* ADM1_ID\n", - "* **ADM2_ID**\n", - "* **YEAR**\n", - "* **POPULATION** (pop at ADM2 level)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ce922c7-6dab-44cf-a94f-8a03d1f816a1", + "id": "1c5e84cf", "metadata": { - "vscode": { - "languageId": "r" - } + "tags": [] }, - "outputs": [], "source": [ - "# Select population file \n", - "if (USE_ADJUSTED_POPULATION) {\n", - " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", - "} else {\n", - " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_population_adm2 <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_pop_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, \n", - " \" [ERROR DETAILS] \", conditionMessage(e)) # log error message , \n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "log_msg(glue(\"DHIS2 population data loaded from dataset: {dhis2_pop_dataset}. 
Dataframe dimensions: {paste(dim(dhis2_population_adm2), collapse=', ')}\"))" + "### 👥 Population Disaggregation logic" ] }, { "cell_type": "code", "execution_count": null, - "id": "c7163965", + "id": "1f377717", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "dhis2_population_adm2 |> head()" - ] - }, - { - "cell_type": "markdown", - "id": "6ae0c5fa", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic" + "# 👥 Resolve col names for disaggregated values\n", + "prepare_disaggregated_indicators(dhis2_routine, DISAGGREGATION_SELECTION, N1_METHOD)" ] }, { "cell_type": "code", "execution_count": null, - "id": "c8620491", + "id": "6905e42c", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "if (INDICATORS_FOUND) { \n", - " POPULATION_SELECTION <- paste0(\"POP_\", DISAGGREGATION_SELECTION) \n", - " if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) {\n", - " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset.\"), \"warning\")\n", - " POPULATION_SELECTION <- \"POPULATION\"\n", - " }\n", - " # The selected column is assigned to POPULATION col so that later code can use it generically\n", - " dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]]\n", - " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.\"))\n", - "}" + "# 👥 Replace col `POPULATION` with selected disaggregated population col (if found)\n", + "# (aka \"mapping\")\n", + "select_population_column(dhis2_population_adm2, DISAGGREGATED_INDICATORS_FOUND, DISAGGREGATION_SELECTION)" ] }, { "cell_type": "markdown", "id": "e596d0ed-56df-4756-83ed-717cfa72f643", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "#### 2.2.1 **Population** data (DHIS2) columns selection.\n" + "#### 2.2.1 **Population** data columns selection.\n" ] }, { @@ -627,69 
+378,51 @@ "execution_count": null, "id": "d5107756-f007-4c39-a4f6-b2ab0a653bd5", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "dhis2_population_adm2 <- dhis2_population_adm2 %>% select(YEAR, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, POPULATION) \n", + "dhis2_population_adm2 <- dhis2_population_adm2 |> \n", + "select(YEAR, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, POPULATION) \n", "\n", "dim(dhis2_population_adm2)\n", - "head(dhis2_population_adm2, 2)" + "head(dhis2_population_adm2, 3)" ] }, { "cell_type": "markdown", "id": "b42a65ab-ad8d-41ba-9edb-dc2636f03a06", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "### 2.3. (optional) **Care Seeking Behaviour** (CSB DHS) (taux recherche soins)\n", - "(20250728) Note: **changed units** (proportion to %), see https://bluesquare.atlassian.net/browse/SNT25-127 " + "### 2.3. (optional) **Care Seeking Behaviour** (CSB DHS) (taux recherche soins)" ] }, { "cell_type": "code", "execution_count": null, - "id": "0a6a5338-9ffd-47d2-b92f-79deb7886078", + "id": "bb277393", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHS_INDICATORS\n", - "file_name <- glue::glue(\"{COUNTRY_CODE}_DHS_ADM1_PCT_CARESEEKING_SAMPLE_AVERAGE.parquet\")\n", - "\n", - "if (USE_CSB_DATA == TRUE) {\n", - " # Read the data, if error (cannot find at defined path) -> set careseeking_data to NULL (so it doesn't break the function at # 3.)\n", - " careseeking_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"🛑 Error while loading DHS Care Seeking data file from `\", dataset_name, file_name ,\"`.\", conditionMessage(e)) # log error message\n", - " log_msg(msg, \"error\")\n", - " return(NULL) # make object NULL on error\n", - " })\n", - " \n", - " # Only print success messages and data info if careseeking_data is NOT NULL\n", 
- " if (!is.null(careseeking_data)) {\n", - " log_msg(paste0(\"Care Seeking data : \", file_name, \" loaded from dataset : \", dataset_name))\n", - " log_msg(paste0(\"Care Seeking data frame dimensions: \", nrow(careseeking_data), \" rows, \", ncol(careseeking_data), \" columns.\"))\n", - " head(careseeking_data)\n", - " } else {\n", - " log_msg(paste0(\"🚨 Care-seeking data not loaded due to an error, `careseeking_data` is set to `NULL`!\"), \"warning\")\n", - " }\n", - " \n", - "} else {\n", - " # if `USE_CSB_DATA == FALSE` ... (basically, ignore CSB data)\n", - " careseeking_data <- NULL\n", - "}" + "load_careseeking_data()" ] }, { "cell_type": "markdown", "id": "92723594-000b-41ee-82a1-8e69106a277d", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 2.4. Load Reporting Rate \n", "\n", @@ -701,54 +434,24 @@ { "cell_type": "code", "execution_count": null, - "id": "d5722d64-ce61-4244-960e-57ebac28e4cf", + "id": "1b6e0549", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# function**\n", - "# Define dataset and file name (based on paramter)\n", - "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - "\n", - "# Try loading dataelement reporting rates.\n", - "reporting_rate_month <- tryCatch({\n", - " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_de)\n", - " log_msg(glue(\"Reporting Rate data: `{file_name_de}` loaded from dataset: `{rr_dataset_name}`. 
Dataframe dimensions: {paste(dim(df), collapse=', ')}\"))\n", - " REPORTING_RATE_METHOD <- \"dataelement\"\n", - " df_loaded\n", - "}, \n", - " error = function(e) { \n", - " cat(glue(\"[ERROR] Error while loading Reporting Rate 'dataelement' version for: {COUNTRY_CODE} {conditionMessage(e)}\"))\n", - " return(NULL)\n", - "})\n", - "\n", - "# Try loading dataset reporting rates.\n", - "if (is.null(reporting_rate_month)) {\n", - " reporting_rate_month <- tryCatch({\n", - " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_ds) \n", - " log_msg(glue(\"Reporting Rate data: `{file_name_ds}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df), collapse=', ')}\"))\n", - " REPORTING_RATE_METHOD <- \"dataset\"\n", - " df_loaded\n", - " }, \n", - " error = function(e) { \n", - " stop(glue(\"[ERROR] Error while loading Reporting Rate 'dataset' version for: {COUNTRY_CODE} {conditionMessage(e)}\")) # raise error\n", - " })\n", - "}\n", - "\n", - "rm(df_loaded)\n", - "dim(reporting_rate_month)\n", - "head(reporting_rate_month, 2)" + "load_reporting_rate_data()" ] }, { "cell_type": "markdown", "id": "9d2529ad-8436-43c4-85b3-ed1ad9621e1e", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 🔍 Checkon data completeness for `REPORTING_RATE` data\n", "Normally we should have \"complete\" data (no missing or `NA` values). However, when using certain datasets (from pipeline: \"Reporting Rate (Dataset)\") we might have incomplete coverage and hence `NA`s ...
\n", @@ -758,39 +461,47 @@ { "cell_type": "code", "execution_count": null, - "id": "eae2a67f", + "id": "bb7bcd61", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Check on data completeness for REPORTING RATE data: \n", - "# check how many values of REPORTING_RATE are NA\n", - "na_count <- sum(is.na(reporting_rate_month$REPORTING_RATE)) \n", - "if (na_count > 0) {\n", - " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", - "}" + "check_reporting_rate_data()" ] }, { "cell_type": "markdown", "id": "9cfa7211-5595-4ed6-9699-0f35aebcbc09", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 2.5. Load Care seeking data (file) \n", "\n", "Load if available" ] }, + { + "cell_type": "markdown", + "id": "c64ecce1", + "metadata": { + "tags": [] + }, + "source": [ + "### 👉 Note from GP\n", + "I'm leaving this here (not wrapped in function) because we need to decide how to handle this alternative CSB data approach ... see [SNT25-397](https://bluesquare.atlassian.net/browse/SNT25-397)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "94ede0e7-e0a8-4e06-ad6c-485869b6d4a9", "metadata": { + "tags": [], "vscode": { "languageId": "r" } @@ -822,6 +533,7 @@ "execution_count": null, "id": "46abe114-0f56-48df-bae7-44147602027c", "metadata": { + "tags": [], "vscode": { "languageId": "r" } @@ -834,7 +546,9 @@ { "cell_type": "markdown", "id": "06f0ebcc-6b87-4d77-98ef-7b8d84be6a0a", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "-------------------------------" ] @@ -842,7 +556,9 @@ { "cell_type": "markdown", "id": "9943c1e5-4d95-4210-8b77-09c4085a96b8", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "## 3. 
Calculate Incidence\n", "First calculate monthly cases, then yearly incidence." @@ -851,7 +567,9 @@ { "cell_type": "markdown", "id": "8769a974-de8a-4a1f-8f74-edb318a28060", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 3.1 **Monthly cases**\n", "\n", @@ -900,7 +618,9 @@ { "cell_type": "markdown", "id": "dcee32af-ae6d-4b2a-9c7a-f846209f1dc3", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "This calculation expects (input):\n", "* **routine_data**: DHIS2 routine data, formatted and aggregated at ADM2 and MONTH level. Tibble (df) _must_ contain the following cols: `YEAR`, `MONTH`, `ADM2`, `CONF`, `TEST`, `SUSP`, `PRES`. \n", @@ -915,43 +635,26 @@ { "cell_type": "code", "execution_count": null, - "id": "a1a0899a-3308-4d90-b06e-8a0cd4b849e1", + "id": "4042ea75", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Ensure correct data type for numerical columns ---------------------------------------\n", - "routine_data <- dhis2_routine %>%\n", - " mutate(across(any_of(c(\"YEAR\", \"MONTH\", \"CONF\", \"TEST\", \"SUSP\", \"PRES\")), as.numeric))\n", - "\n", - "reporting_rate_data <- reporting_rate_month %>% # reporting_rate_data\n", - " mutate(across(c(YEAR, MONTH, REPORTING_RATE), as.numeric))" + "enforce_numeric_cols()" ] }, { "cell_type": "markdown", "id": "736dec8f", - "metadata": {}, - "source": [ - "#### 3.1.0. Aggregate at `ADM2` x `MONTH` & calculate **TPR**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8899964a", "metadata": { - "vscode": { - "languageId": "r" - } + "tags": [] }, - "outputs": [], "source": [ - "# Check for TEST > SUSP\n", - "routine_data |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " + "#### 3.1.0. 
Aggregate at `ADM2` x `MONTH` & calculate **TPR**" ] }, { @@ -959,51 +662,28 @@ "execution_count": null, "id": "58bf219e", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Group & compute TPR\n", - "monthly_cases <- routine_data %>%\n", - " group_by(ADM1_ID, ADM2_ID, YEAR, MONTH) %>% # ADM1 needed to join careseeking data\n", - " summarise(\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " across(any_of(\"PRES\"), ~sum(., na.rm = TRUE), .names = \"PRES\"), # <- handles missing 'PRES' column gracefully\n", - " .groups = \"drop\") %>%\n", - " # Cleaning TEST data for \"SUSP-TEST\" method\n", - " mutate(TEST = ifelse(N1_METHOD == \"SUSP-TEST\" & !is.na(SUSP) & (TEST > SUSP), SUSP, TEST)) %>%\n", - " left_join(reporting_rate_data,\n", - " by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>% \n", - " # Calculate TPR based on CONF and TEST\n", - " # Note: if TEST is 0 or NA, set TPR = 1 (to avoid division by zero which produces Inf)\n", - " mutate( \n", - " TPR = ifelse(!is.na(CONF) & !is.na(TEST) & (TEST != 0), CONF / TEST, 1)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "938f0194", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check for TEST > SUSP\n", - "monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " + "monthly_cases <- build_monthly_cases(\n", + " routine_data = routine_data,\n", + " reporting_rate_data = reporting_rate_data,\n", + " N1_METHOD = N1_METHOD,\n", + " care_seeking_data_f = care_seeking_data_f,\n", + " careseeking_data = careseeking_data\n", + ")" ] }, { "cell_type": "markdown", "id": "df43d6d8", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 3.1.1. 
Calculate **N1**" ] @@ -1013,38 +693,22 @@ "execution_count": null, "id": "ead591bb-3936-486d-bb9e-7b01d0805d0d", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Calculate N1 based on `N1_METHOD` & availability of `PRES` \n", - "\n", - "if (N1_METHOD == \"SUSP-TEST\") {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", - " log_msg(\"Calculating N1 as `N1 = CONF + ((SUSP - TEST) * TPR)`\")\n", - "} else if (N1_METHOD == \"PRES\") {\n", - " # if: column named \"PRES\" exists in `monthly_cases` and contains at least one non-missing value\n", - " if (\"PRES\" %in% names(monthly_cases) && !all(is.na(monthly_cases$PRES))) {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + (PRES * TPR))\n", - " log_msg(\"ℹ️ Calculating N1 as `N1 = CONF + (PRES * TPR)`\")\n", - " } else {\n", - " log_msg(\"🚨 Warning: 'PRES' not found in routine data or contains all `NA` values! 🚨 Calculating N1 using 'SUSP-TEST' method instead.\")\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", - " }\n", - "} else {\n", - " log_msg(\"Invalid N1_METHOD. Please use 'PRES' or 'SUSP-TEST'.\") # not really necessary ... \n", - "}" + "# Moved to utils in `build_monthly_cases()`." ] }, { "cell_type": "markdown", "id": "9543d283", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 3.1.2. Calculate **N2**" ] @@ -1054,82 +718,73 @@ "execution_count": null, "id": "5060017a", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Calculate N2\n", - "monthly_cases <- monthly_cases %>%\n", - " mutate(\n", - " N2 = ifelse(REPORTING_RATE == 0, NA_real_, N1 / REPORTING_RATE) # On the fly convert `RR == 0` to NA to avoid N2 == Inf\n", - " )" + "# Moved to utils in `build_monthly_cases()`." 
] }, { "cell_type": "code", "execution_count": null, - "id": "debb1745-5066-4126-8a15-853b21ee8776", + "id": "390aa211", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Log msg about zero REPORTING RATE cases and warn that N2 set to NA\n", - "\n", - "zero_reporting <- reporting_rate_data %>%\n", - " filter(REPORTING_RATE == 0) %>%\n", - " summarise(\n", - " n_months_zero_reporting = n(),\n", - " affected_zones = n_distinct(ADM2_ID)\n", - " )\n", - "\n", - "if (zero_reporting$n_months_zero_reporting > 0) { \n", - " log_msg(glue(\"🚨 Note: {zero_reporting$n_months_zero_reporting} rows had `REPORTING_RATE == 0` across \",\n", - " \"{zero_reporting$affected_zones} ADM2. These N2 values were set to NA.\"))\n", - "} else {\n", - " log_msg(\"✅ Note: no ADM2 has `REPORTING_RATE == 0`. All N2 values were preserved.\")\n", - "}" + "handle_zeros_in_reporting_rate()" ] }, { "cell_type": "markdown", "id": "053f854f-ee8c-40a3-abd6-82d69cc7dca4", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 3.1.3. (optional) Compute **N3** with adjusted **N2** by 'care seeking data file' (csv)\n" ] }, + { + "cell_type": "markdown", + "id": "455a6dc3", + "metadata": { + "tags": [] + }, + "source": [ + "### 👉 Note from GP\n", + "I'm leaving this here (not wrapped in function) because we need to decide how to handle this alternative CSB data approach ... 
see [SNT25-397](https://bluesquare.atlassian.net/browse/SNT25-397)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "5a306e23-93b7-4b8d-84ec-e4afe413a613", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "if (!is.null(care_seeking_data_f)) {\n", - " monthly_cases <- monthly_cases %>%\n", - " left_join(., care_seeking_data_f %>% select(ADM1_ID, PCT), by = c(\"ADM1_ID\")) %>%\n", - " mutate(\n", - " N3 = N2 / PCT\n", - " ) %>% \n", - " select(-PCT)\n", - " log_msg(\"N2 adjusted by care seeking data (NER Specific).\")\n", - " # head(monthly_cases)\n", - "}" + "# Moved to utils in `build_monthly_cases()`." ] }, { "cell_type": "markdown", "id": "dd9b5677", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 3.1.4. (optional) Calculate **N3**" ] @@ -1139,23 +794,14 @@ "execution_count": null, "id": "7aa926ed-99ea-474c-988e-8151d6b12002", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Only calculate N3 if CARESEEKING data is avaiable \n", - "if (!is.null(careseeking_data)) {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(YEAR = as.numeric(YEAR)) %>% # keep as safety\n", - " left_join(., careseeking_data, by = c(\"ADM1_ID\")) %>%\n", - " mutate(\n", - " N3 = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE) \n", - " )\n", - "} else {\n", - " print(\"🦘 Careseeking data not available, skipping calculation of N3.\")\n", - "}" + "# Moved to utils in `build_monthly_cases()`." ] }, { @@ -1163,6 +809,7 @@ "execution_count": null, "id": "a67ddc0e-40ea-41d7-9fef-5e05d9594956", "metadata": { + "tags": [], "vscode": { "languageId": "r" } @@ -1175,7 +822,9 @@ { "cell_type": "markdown", "id": "fb4214ba", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 💾 Export `monthly_cases` (for 📓report notebook)\n", "For coherence checks, which need monthly resolution ... !" 
@@ -1184,27 +833,24 @@ { "cell_type": "code", "execution_count": null, - "id": "d80a7cb6", + "id": "6fa60920", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Save monthly_cases as .parquet file \n", - "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", - "arrow::write_parquet(monthly_cases, file_path)\n", - "\n", - "# Log msg\n", - "log_msg(glue(\"Monthly cases data saved to: {file_path}\"))\n", - "head(monthly_cases)" + "export_monthly_cases(monthly_cases)" ] }, { "cell_type": "markdown", "id": "7b50302e-20af-4fa6-8e8c-1e3a6c763ea2", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 🔍 Data **coherence** checks on **monthly cases**\n", "Check for ratios or differences that will cause negative values -> which will causes adjusted incidence to be lower than the values it adjust\n", @@ -1219,7 +865,9 @@ { "cell_type": "markdown", "id": "e9f7ae73-46f6-4c78-9bb2-fdcfbd591b10", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 1. `PRES == 0`: causes `N1 == CONF` \n", "(if `N1_METHOD == \"PRES\"`)" @@ -1228,25 +876,24 @@ { "cell_type": "code", "execution_count": null, - "id": "495fe18a-50ad-4eff-8669-135c63a7c8dd", + "id": "f04d55a3", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Run this check only if N1_METHOD == \"PRES\" (else, problem doesn't exist)\n", - "if (N1_METHOD == \"PRES\") {\n", - " nr_of_pres_0_adm2_month <- monthly_cases |> filter(PRES == 0) |> nrow()\n", - " log_msg(glue(\"🚨 Note: using `PRES` for incidence adjustement, but `PRES == 0` for {nr_of_pres_0_adm2_month} rows (ADM2 x MONTH).\"), \"warning\")\n", - "}" + "coherence_check_PRES(monthly_cases)" ] }, { "cell_type": "markdown", "id": "e12e744b-540e-462c-a16e-edbb05ddc047", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 2. 
`SUSP-TEST`: if negative, then N1 smaller or equal to CONF (ADJ =< CRUDE)\n", "(if `N1_METHOD == \"SUSP-TEST\"`)" @@ -1255,30 +902,24 @@ { "cell_type": "code", "execution_count": null, - "id": "49de98f1-2424-440e-922f-72d7702dd894", + "id": "6c550c9f", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# SUSP - TEST: if negative (TEST > SUSP), then N1 smaller or equal to CONF, which then causes ADJ ≤ CRUDE\n", - "if (N1_METHOD == \"SUSP-TEST\") {\n", - " nr_of_negative <- monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() \n", - " if (nr_of_negative > 0) {\n", - " log_msg(\n", - " glue(\"🚨 Note: using formula `SUSP - TEST` for incidence adjustement, but higher tested than suspected cases (`SUSP < TEST`) detected in {nr_of_negative} rows (ADM2 x MONTH).\"),\n", - " \"warning\"\n", - " )\n", - " }\n", - "}" + "coherence_check_SUSP_TEST(monthly_cases)" ] }, { "cell_type": "markdown", "id": "d72d7545-9afa-45c6-9efd-2619aecfc794", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 3. 
`CONF/TEST` = `TPR` (to calculate N1: Incidence adjusted for **Testing**)\n", "This **ratio should** always be **≤ 1** because **there should _not_ be more confirmed cases than tested** ...\n", @@ -1289,25 +930,24 @@ { "cell_type": "code", "execution_count": null, - "id": "9cc60295-5046-4932-b332-965fd320f72e", + "id": "c21cd23e", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "more_confirmed_than_tested <- monthly_cases |> mutate(CONF_divby_TEST = CONF / TEST) |> filter(CONF_divby_TEST > 1) |> nrow() \n", - "\n", - "if (more_confirmed_than_tested > 0) {\n", - " log_msg(glue(\"🚨 Note: higher confirmed than tested cases (`CONF/TEST`) detected in {more_confirmed_than_tested} rows (ADM2 x MONTH).\"), \"warning\")\n", - "}" + "coherence_check_CONF_TEST(monthly_cases)" ] }, { "cell_type": "markdown", "id": "acbabb99-07ce-4054-a702-2d3cd59c328e", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 3.2 **Yearly incidence**\n", "After calculating N1 and N2 for each `ADM2`-`MONTH`, we aggregate the data annually to compute the yearly totals (sums) for crude cases (`CONF`), `N1` and `N2`. Finally, we compute:\n", @@ -1315,6 +955,7 @@ "* Incidence adjusted for testing: N1 / POP × 1000\n", "* Incidence adjusted for testing and reporting: N2 / POP × 1000\n", "* Incidence adjusted for testing, reporting and careseeking behaviour (optional): N3 / POP × 1000\n", + "* ⚠️⚠️⚠️ **TO DO: add option of CSB from custom source ... 
(`care_seeking_data_f`)** ⚠️⚠️⚠️\n", "\n", "--------------" ] @@ -1322,7 +963,9 @@ { "cell_type": "markdown", "id": "d47a3908-71cb-4e79-8771-f6caceae4ce2", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "The calculation expects (input):\n", "* **monthly_cases**: as the output of `calculate_monthly_cases()`, or a tibble/data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (CONF, TEST, SUSP, PRES), `TPR`, `N1`, `N2` \n", @@ -1339,18 +982,14 @@ "execution_count": null, "id": "f8753721-067b-4da3-8305-8d98f823454f", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# ---- 1. Enforce column types upfront ----\n", - "monthly_cases <- monthly_cases %>% \n", - " mutate(across(where(is.numeric), as.numeric)) # Convert all numeric columns\n", - " \n", - "population_data <- dhis2_population_adm2 %>% # population_data\n", - " mutate(across(c(YEAR, POPULATION), as.numeric))" + "# Moved to utils in `build_yearly_incidence()`." ] }, { @@ -1358,31 +997,19 @@ "execution_count": null, "id": "e4a9ea81-4f7c-4505-8847-07de13831a42", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# ---- 2. Core calculation ----\n", - "yearly_incidence <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " # 🚨 removed `na.rm = TRUE` on 20250702 - if things break check here! 
🚨 \n", - " across(c(CONF, N1, N2), ~sum(.)), #, na.rm = TRUE)), # 🔍 PROBLEM: if NA's in N2 (due to missing RR data), the sum of N2 by YEAR is smaller than the sum of N1 !\n", - " # across(any_of(c(\"CONF\", \"TEST\", \"SUSP\", \"PRES\", \"N1\", \"N2\")), ~sum(.)), # silenced as not necessary to also summarize \"TEST\", \"SUSP\", \"PRES\"\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " left_join(\n", - " population_data,\n", - " by = c(\"ADM2_ID\", \"YEAR\")\n", - " ) %>%\n", - " mutate(\n", - " INCIDENCE_CRUDE = CONF / POPULATION * 1000,\n", - " INCIDENCE_ADJ_TESTING = N1 / POPULATION * 1000,\n", - " INCIDENCE_ADJ_REPORTING = N2 / POPULATION * 1000\n", - " ) |>\n", - " ungroup()" + "yearly_incidence <- build_yearly_incidence(\n", + " monthly_cases = monthly_cases,\n", + " dhis2_population_adm2 = dhis2_population_adm2,\n", + " care_seeking_data_f = care_seeking_data_f,\n", + " careseeking_data = careseeking_data\n", + ")" ] }, { @@ -1390,31 +1017,14 @@ "execution_count": null, "id": "c712f4c2-d677-4d22-8298-111aa0a93034", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# ---- 3.1 Optional careseeking data CSV adjustment ---- \n", - "if (!is.null(care_seeking_data_f) && \"N3\" %in% names(monthly_cases)) {\n", - " n3_data <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(N3 = sum(N3, na.rm = TRUE),\n", - " .groups = \"drop\") |>\n", - " ungroup()\n", - " \n", - " yearly_incidence <- yearly_incidence %>%\n", - " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", - " )\n", - " } else {\n", - " yearly_incidence <- yearly_incidence |>\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = NA\n", - " )\n", - " }" + "# Moved to utils in `build_yearly_incidence()`." 
] }, { @@ -1422,35 +1032,14 @@ "execution_count": null, "id": "7602a82b-8829-4613-8961-61c419073269", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# ---- 3.2 Optional careseeking adjustment ----\n", - "if (is.null(care_seeking_data_f)) { # quick fix\n", - " \n", - " if (!is.null(careseeking_data) && \"N3\" %in% names(monthly_cases)) {\n", - " n3_data <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(N3 = sum(N3, na.rm = TRUE),\n", - " .groups = \"drop\") |>\n", - " ungroup()\n", - " \n", - " yearly_incidence <- yearly_incidence %>%\n", - " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", - " )\n", - " } else {\n", - " yearly_incidence <- yearly_incidence |>\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = NA\n", - " )\n", - " }\n", - "\n", - "}" + "# Moved to utils in `build_yearly_incidence()`." ] }, { @@ -1458,6 +1047,7 @@ "execution_count": null, "id": "56001b15-f74e-42d9-bfa2-bd5563b6a512", "metadata": { + "tags": [], "vscode": { "languageId": "r" } @@ -1470,7 +1060,9 @@ { "cell_type": "markdown", "id": "7976f894-daf4-46c3-9fa1-5303cbba0818", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 🔍 Data **coherence** checks on **yearly incidence**\n", "Here we check if values of Indicidence (already at `YEAR` resolution) make sense in relation to each other.
\n", @@ -1481,7 +1073,9 @@ { "cell_type": "markdown", "id": "d3dfac34-86f5-4f8c-add9-f54485259924", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 1. `INCIDENCE_ADJ_TESTING` (adj. level 1) should always be greater than `INCIDENCE_CRUDE` (not adjusted)" ] @@ -1489,36 +1083,24 @@ { "cell_type": "code", "execution_count": null, - "id": "acb03778-f1db-4f28-9b09-3cd8d815f976", + "id": "47f0b6a8", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# same as below but different cols ... \n", - "# Count TRUE values, handling potential NAs in the result of if_else\n", - "nr_of_impossible_values <- yearly_incidence |>\n", - " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE)) |>\n", - " pull(IMPOSSIBLE_VALUE) |>\n", - " sum(na.rm = TRUE) \n", - "\n", - "# Warning if any impossible values are found\n", - "if (nr_of_impossible_values > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE!\"), \"warning\")\n", - "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_CRUDE` is smaller than `INCIDENCE_ADJ_TESTING` (as expected).\")\n", - "\n", - "# Check if all values in a column are NA\n", - "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_TESTING))) {\n", - " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_TESTING` are `NA`s\", \"warning\")\n", - "}\n" + "coherence_checkes_yearly_incidence(yearly_incidence, incidence_col_1 = \"INCIDENCE_CRUDE\", incidence_col_2 = \"INCIDENCE_ADJ_TESTING\")" ] }, { "cell_type": "markdown", "id": "827d1e84-7f43-404c-88cc-9b675bfa48a1", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 2. `INCIDENCE_ADJ_REPORTING` (adj. level 2) should always be greater than `INCIDENCE_ADJ_TESTING` (adj. 
level 1)" ] @@ -1526,35 +1108,24 @@ { "cell_type": "code", "execution_count": null, - "id": "8b1da976-b157-4e94-b5e9-8795e87bb416", + "id": "934d1520", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# Count TRUE values, handling potential NAs in the result of if_else\n", - "nr_of_impossible_values <- yearly_incidence |>\n", - " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)) |>\n", - " pull(IMPOSSIBLE_VALUE) |>\n", - " sum(na.rm = TRUE) \n", - "\n", - "# Warning if any impossible values are found\n", - "if (nr_of_impossible_values > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING!\"), \"warning\")\n", - "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_ADJ_TESTING` is smaller than `INCIDENCE_ADJ_REPORTING` (as expected).\")\n", - "\n", - "# Check if all values in a column are NA\n", - "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_REPORTING))) {\n", - " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_REPORTING` are `NA`s\", \"warning\")\n", - "}" + "coherence_checkes_yearly_incidence(yearly_incidence, incidence_col_1 = \"INCIDENCE_ADJ_TESTING\", incidence_col_2 = \"INCIDENCE_ADJ_REPORTING\")" ] }, { "cell_type": "markdown", "id": "3e57f2e8-1ccc-417c-9fa6-e6b1976336bc", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "## 4. Export to `/data/dhis2_incidence/` folder" ] @@ -1562,7 +1133,9 @@ { "cell_type": "markdown", "id": "5b6861bb", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "### 4.0. 
Keep only essential cols \n", "Based on [SNT Pipelines Data glossary](https://docs.google.com/spreadsheets/d/1qvZMsmCWU6cVLgGZTEXsd5xmoecIxb4LAd-g_2qzYdw/edit?usp=sharing)" @@ -1573,6 +1146,7 @@ "execution_count": null, "id": "b4085515", "metadata": { + "tags": [], "vscode": { "languageId": "r" } @@ -1588,24 +1162,12 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e432998-bf85-4706-bea4-8684b0b58c16", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# save_yearly_incidence() loaded from utils/snt_dhis2_incidence.r" - ] - }, { "cell_type": "markdown", "id": "3ed49106-b335-42c3-9511-ffd864dd50f0", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "#### 👥 Population Disaggregation logic \n", "\n", @@ -1617,14 +1179,14 @@ "execution_count": null, "id": "8f15c0c4-74d9-4280-ba46-8add435a9147", "metadata": { + "tags": [], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "# if (COUNTRY_CODE == \"NER\" & INDICATORS_FOUND) {\n", - "if (INDICATORS_FOUND) {\n", + "if (DISAGGREGATED_INDICATORS_FOUND) {\n", " log_msg(glue(\"ℹ️ The results have been computed using the following Indicators: {paste(target_colnames, collapse=', ')}\"))\n", " log_msg(glue(\"ℹ️ The results have been computed using the following Population: {POPULATION_SELECTION}\"))\n", "}" @@ -1635,6 +1197,7 @@ "execution_count": null, "id": "16e7d83b-3962-4041-9d2d-aaa362b62d5f", "metadata": { + "tags": [], "vscode": { "languageId": "r" } @@ -1644,10 +1207,10 @@ "# Export the data\n", "\n", "# CSV\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv, COUNTRY_CODE)\n", + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv)\n", "\n", "# Parquet\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet, COUNTRY_CODE)" + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet)" ] } ], @@ -1664,7 +1227,8 @@ 
"name": "R", "pygments_lexer": "r", "version": "4.4.3" - } + }, + "papermill": {} }, "nbformat": 4, "nbformat_minor": 5 diff --git a/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r b/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r index 7242035..ec3d180 100644 --- a/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r +++ b/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r @@ -1,44 +1,550 @@ -# Shared helpers for snt_dhis2_incidence notebooks. +# Store code to be sourced in the notebook in this same directory, so that the main notebook +# only shows the code relevant to the analysis, and not the boring routine setup, import and export. +# Each piece of code is wrapped in a function to keep the notebook clean. -resolve_routine_filename <- function(routine_choice) { - if (routine_choice == "raw") return("_routine.parquet") - is_removed <- FALSE - if (routine_choice == "raw_without_outliers") is_removed <- TRUE - removed_status <- if (is_removed) "removed" else "imputed" - return(glue::glue("_routine_outliers_{removed_status}.parquet")) +message("This step sets up the environment for the DHIS2 incidence pipeline, including paths, config, and utility functions. 
+It basically handles all the boring stuff so that you can focus on the code that matters :) +") + +setup_paths <- function() { +SNT_ROOT_PATH <<- "/home/hexa/workspace" +CODE_PATH <<- file.path(SNT_ROOT_PATH, 'code') +CONFIG_PATH <<- file.path(SNT_ROOT_PATH, 'configuration') +DATA_PATH <<- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') +INTERMEDIATE_DATA_PATH <<- file.path(DATA_PATH, "intermediate_results") +message("Paths set up:") +message("CODE_PATH: ", CODE_PATH) +message("CONFIG_PATH: ", CONFIG_PATH) +message("DATA_PATH: ", DATA_PATH) +message("INTERMEDIATE_DATA_PATH: ", INTERMEDIATE_DATA_PATH) +} + +create_intermediate_data_dir <- function() { +if (!dir.exists(INTERMEDIATE_DATA_PATH)) { +dir.create(INTERMEDIATE_DATA_PATH, recursive = TRUE) +log_msg(glue("Created directory for intermediate results: {INTERMEDIATE_DATA_PATH}")) +} } -infer_reporting_routine_choice <- function(reporting_parameters) { - if (is.null(reporting_parameters)) return(NULL) - if (length(names(reporting_parameters)) == 0) return(NULL) +load_utils <- function() { +utils_path <- "/home/hexa/workspace/code/snt_utils.r" +palettes_path <- "/home/hexa/workspace/code/snt_palettes.r" +source("/home/hexa/workspace/code/snt_utils.r") +message("Utils loaded from: ", utils_path) +source("/home/hexa/workspace/code/snt_palettes.r") +message("Palettes loaded from: ", palettes_path) +} + +set_env_openhexa <- function() { +Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") +Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") +Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") +message("Python config:") +print(reticulate::py_config()) +openhexa <<- import("openhexa.sdk") +message("Openhexa SDK imported successfully.") +} + +import_config_json <- function() { +config_path <- file.path(CONFIG_PATH, "SNT_config.json") +config_json <<- tryCatch({ fromJSON(config_path) }, +error = function(e) { +msg <- paste0("[ERROR] Error while loading configuration: ", conditionMessage(e)) +cat(msg) +stop(msg) +}) 
+log_msg(glue("SNT configuration loaded from: {config_path}")) +} + +config_generic <- function() { +COUNTRY_CODE <<- config_json$SNT_CONFIG$COUNTRY_CODE +ADMIN_1 <<- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1) +ADMIN_2 <<- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2) +log_msg(glue("Configuration values set: COUNTRY_CODE={COUNTRY_CODE}, ADMIN_1={ADMIN_1}, ADMIN_2={ADMIN_2}")) +} + +config_incidence <- function() { +DHIS2_INDICATORS <<- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) +log_msg(glue("DHIS2 indicators set: {paste(DHIS2_INDICATORS, collapse=', ')}")) +} - if ("ROUTINE_FILE" %in% names(reporting_parameters)) { - routine_file <- as.character(reporting_parameters$ROUTINE_FILE[[1]]) - if (grepl("_routine_outliers_removed\\.parquet$", routine_file)) return("raw_without_outliers") - if (grepl("_routine_outliers_imputed\\.parquet$", routine_file)) return("imputed") - if (grepl("_routine\\.parquet$", routine_file)) return("raw") +set_fixed_cols <- function() { +fixed_cols <<- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') +log_msg(glue("Fixed routine data ('dhis2_routine') columns set: {paste(fixed_cols, collapse=', ')}")) +} + +resolve_routine_filename <- function() { +if (ROUTINE_DATA_CHOICE == "raw") return("_routine.parquet") +is_removed <<- FALSE +if (ROUTINE_DATA_CHOICE == "raw_without_outliers") is_removed <<- TRUE +removed_status <<- if (is_removed) "removed" else "imputed" +return(glue::glue("_routine_outliers_{removed_status}.parquet")) +} + +select_routine_dataset_and_filename <- function() { +if (ROUTINE_DATA_CHOICE == "raw") { +routine_dataset_name <<- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED +routine_name <<- resolve_routine_filename() +routine_filename <<- paste0(COUNTRY_CODE, routine_name) +} else { +routine_dataset_name <<- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION +routine_name <<- resolve_routine_filename() +routine_filename <<- paste0(COUNTRY_CODE, 
routine_name) +} +log_msg(glue("Selected routine dataset: {routine_dataset_name}, filename: {routine_filename}")) +} + +load_dhis2_routine_data <- function() { +dhis2_routine <<- tryCatch({ get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, +error = function(e) { +if (grepl("does not exist", conditionMessage(e), ignore.case = TRUE)) { +msg <- paste0("[ERROR] File not found! 🛑 The file ", routine_filename, " does not exist in ", routine_dataset_name, ". To generate it, execute the pipeline DHIS2 Outliers Removal and Imputation.") +} else { +msg <- paste0("[ERROR] 🛑 Error while loading DHIS2 routine data file : ", routine_filename, ". [ERROR DETAILS] " , conditionMessage(e)) +} +stop(msg) +}) +log_msg(paste0("DHIS2 routine data : ", routine_filename, " loaded. Dims: ", paste(dim(dhis2_routine), collapse=", "))) +return(head(dhis2_routine, 3)) +} + +check_fixed_cols_in_routine <- function() { +actual_cols <- colnames(dhis2_routine) +missing_cols <- setdiff(fixed_cols, actual_cols) +if (length(missing_cols) == 0) { +log_msg(paste0("All expected 'fixed' columns present.")) +} else { +log_msg(paste0("🚨 Missing Columns: ", paste(missing_cols, collapse = ", ")), "warning") +} +} + +check_dhis2_indicators_cols_in_routine <- function() { +actual_cols <- colnames(dhis2_routine) +missing_cols <- setdiff(DHIS2_INDICATORS, actual_cols) +if (length(missing_cols) == 0) { +log_msg("All DHIS2 indicators present in 'dhis2_routine'.") +} else { +log_msg(paste0("🚨 Missing DHIS2 INDICATORS: ", paste(missing_cols, collapse = ", ")), "warning") +} +} + +check_PRES_col <- function() { +if (exists("N1_METHOD") && N1_METHOD == "PRES") { +pres_in_routine <- any(names(dhis2_routine) == "PRES") +pres_in_config <- any(DHIS2_INDICATORS == "PRES") + if (!pres_in_routine) { + log_msg("🛑 Column `PRES` missing from routine data!", "error") + stop() + } + log_msg("Column `PRES` is present. 
Proceeding.") +} else { + # This is just for the nb, no need to long in pipeline run + print("N1_METHOD is not set to 'PRES'. No need to check for `PRES` column.") } +} + +load_population_data <- function() { +dhis2_pop_dataset <- if (USE_ADJUSTED_POPULATION) config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION else config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + +dhis2_population_adm2 <<- get_latest_dataset_file_in_memory(dhis2_pop_dataset, paste0(COUNTRY_CODE, "_population.parquet")) +log_msg(glue("DHIS2 population data loaded from {dhis2_pop_dataset}.")) +return(head(dhis2_population_adm2, 3)) + - if ("REPORTING_RATE_METHOD" %in% names(reporting_parameters)) return("raw") - return(NULL) } -save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function, country_code = NULL) { - if (is.null(country_code) && exists("COUNTRY_CODE")) { - country_code <- get("COUNTRY_CODE") +# --- DISAGGREGATION LOGIC --- --- --- --- --- --- --- + +# DISAGGREGATED_INDICATORS_FOUND <- FALSE # 👈 + +# if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c("SUSP-TEST", "PRES")) { +# # Determine the expected column names based on the disaggregation selection and method +# prefix_method <- ifelse(N1_METHOD == "SUSP-TEST", "SUSP", "PRES") +# prefix_fixed <- c("TEST", "CONF") +# prefix_all <- c(prefix_method, prefix_fixed) +# target_colnames <- glue("{prefix_all}_{DISAGGREGATION_SELECTION}") + +# if (all(target_colnames %in% colnames(dhis2_routine))) { +# # Map the disaggregated columns (e.g., SUSP_UNDER_5) to generic names (e.g., SUSP) so that +# # the rest of the pipeline can use them without needing to know about the disaggregation +# dhis2_routine[prefix_all] <- dhis2_routine[target_colnames] +# for (col in target_colnames) { +# log_msg(glue("Population Disaggregation: Successfully mapped indicator: {col}")) +# } +# # Signal success for the next code block +# DISAGGREGATED_INDICATORS_FOUND <- TRUE # 👈 +# } else { +# 
missing_cols <- setdiff(target_colnames, colnames(dhis2_routine)) +# log_msg(glue("Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed."), "warning") +# log_msg(glue("Population Disaggregation: Missing columns in routine dataset: {paste(missing_cols, collapse = ', ')}"), "warning") +# msg <- glue("[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.") +# stop(msg) +# } +# } else { +# # Print just in nb (not in pipeline logs) +# print("Population Disaggregation: No disaggregation applied based on the current configuration.") +# } + +prepare_disaggregated_indicators <- function(dhis2_routine, DISAGGREGATION_SELECTION, N1_METHOD) { + # Initialize the flag locally + DISAGGREGATED_INDICATORS_FOUND <<- FALSE + + if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c("SUSP-TEST", "PRES")) { + # Determine the expected column names based on the disaggregation selection and method + prefix_method <- ifelse(N1_METHOD == "SUSP-TEST", "SUSP", "PRES") + prefix_fixed <- c("TEST", "CONF") + prefix_all <- c(prefix_method, prefix_fixed) + target_colnames <<- glue::glue("{prefix_all}_{DISAGGREGATION_SELECTION}") + + if (all(target_colnames %in% colnames(dhis2_routine))) { + # Map the disaggregated columns (e.g., SUSP_UNDER_5) to generic names (e.g., SUSP) so that + # the rest of the pipeline can use them without needing to know about the disaggregation + dhis2_routine[prefix_all] <- dhis2_routine[target_colnames] + + for (col in target_colnames) { + log_msg(glue::glue("Population Disaggregation: Successfully mapped indicator: {col}")) + } + + # Signal success for the next code block + DISAGGREGATED_INDICATORS_FOUND <<- TRUE + } else { + missing_cols <- setdiff(target_colnames, colnames(dhis2_routine)) + log_msg(glue::glue("Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed."), "warning") + log_msg(glue::glue("Population Disaggregation: Missing columns in 
routine dataset: {paste(missing_cols, collapse = ', ')}"), "warning") + msg <- glue::glue("[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.") + stop(msg) } - if (is.null(country_code)) { - stop("country_code is required to export yearly incidence.") + } else { + # Print just in nb (not in pipeline logs) + print("Population Disaggregation: No disaggregation applied based on the current configuration.") + } + # return(dhis2_routine) + dhis2_routine <<- dhis2_routine +} + +# if (DISAGGREGATED_INDICATORS_FOUND) { +# POPULATION_SELECTION <- paste0("POP_", DISAGGREGATION_SELECTION) +# if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) { +# log_msg(glue("Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset."), "warning") +# POPULATION_SELECTION <- "POPULATION" +# } +# # The selected column is assigned to POPULATION col so that later code can use it generically +# dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]] +# log_msg(glue("Population Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.")) +# } + +select_population_column <- function(dhis2_population_adm2, DISAGGREGATED_INDICATORS_FOUND, DISAGGREGATION_SELECTION) { + # Default value for the selection if the condition isn't met or if it fails + POPULATION_SELECTION <<- "POPULATION" + if (DISAGGREGATED_INDICATORS_FOUND) { + POPULATION_SELECTION <<- paste0("POP_", DISAGGREGATION_SELECTION) + if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) { + log_msg(glue::glue("Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset."), "warning") + POPULATION_SELECTION <<- "POPULATION" + } + # The selected column is assigned to POPULATION col so that later code can use it generically + dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]] + log_msg(glue::glue("Population 
Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.")) + } else { + # Print just in nb (not in pipeline logs) + print("Population Disaggregation: No disaggregation applied based on the current configuration.") + } + dhis2_population_adm2 <<- dhis2_population_adm2 +} + +# --- --- --- --- --- --- --- --- --- + + +load_careseeking_data <- function() { + if (USE_CSB_DATA == TRUE) { + dataset_name <<- config_json$SNT_DATASET_IDENTIFIERS$DHS_INDICATORS + file_name <<- glue::glue("{COUNTRY_CODE}_DHS_ADM1_PCT_CARESEEKING_SAMPLE_AVERAGE.parquet") + careseeking_data <<- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, + error = function(e) { + msg <- paste("🛑 Error while loading DHS Care Seeking data file from `", dataset_name, file_name ,"`.", conditionMessage(e)) # log error message + log_msg(msg, "error") + return(NULL) # make object NULL on error + }) + if (!is.null(careseeking_data)) { + log_msg(paste0("Care Seeking data : ", file_name, " loaded from dataset : ", dataset_name)) + log_msg(paste0("Care Seeking data frame dimensions: ", nrow(careseeking_data), " rows, ", ncol(careseeking_data), " columns.")) + head(careseeking_data) + } else { + log_msg(paste0("🚨 Care-seeking data not loaded due to an error, `careseeking_data` is set to `NULL`!"), "warning") + } + } else { + careseeking_data <<- NULL + print("USE_CSB_DATA is set to FALSE. 
Care-seeking data will be ignored and `careseeking_data` is set to `NULL`.") } +} - file_name <- paste0(country_code, "_incidence", file_extension) - file_path <- file.path(data_path, file_name) - output_dir <- dirname(file_path) - if (!dir.exists(output_dir)) { - dir.create(output_dir, recursive = TRUE) +load_reporting_rate_data <- function() { + rr_dataset_name <<- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE + file_name_de <<- paste0(COUNTRY_CODE, "_reporting_rate_dataelement.parquet") + file_name_ds <<- paste0(COUNTRY_CODE, "_reporting_rate_dataset.parquet") + reporting_rate_month <<- tryCatch({ + df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_de) + log_msg(glue("Reporting Rate data: `{file_name_de}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df_loaded), collapse=', ')}")) + REPORTING_RATE_METHOD <<- "dataelement" + df_loaded + }, + error = function(e) { + cat(glue("[ERROR] Error while loading Reporting Rate 'dataelement' version for: {COUNTRY_CODE} {conditionMessage(e)}")) + return(NULL) + }) + if (is.null(reporting_rate_month)) { + reporting_rate_month <<- tryCatch({ + df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_ds) + log_msg(glue("Reporting Rate data: `{file_name_ds}` loaded from dataset: `{rr_dataset_name}`. 
Dataframe dimensions: {paste(dim(df_loaded), collapse=', ')}")) + REPORTING_RATE_METHOD <<- "dataset" + df_loaded + }, + error = function(e) { + stop(glue("[ERROR] Error while loading Reporting Rate 'dataset' version for: {COUNTRY_CODE} {conditionMessage(e)}")) # raise error + }) } + rm(df_loaded) + log_msg(glue("Final Reporting Rate ({REPORTING_RATE_METHOD}) data frame dimensions: {paste(dim(reporting_rate_month), collapse=', ')}")) + head(reporting_rate_month, 3) +} + - write_function(yearly_incidence, file_path) - log_msg(paste0("Exporting : ", file_path)) +check_reporting_rate_data <- function() { + if (!is.null(reporting_rate_month)) { + na_count <<- sum(is.na(reporting_rate_month$REPORTING_RATE)) + if (na_count > 0) { + log_msg(glue("⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column."), "warning") + } else { + log_msg("✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.") + } + } else { + log_msg("🚨 Reporting Rate data frame is NULL. Cannot check for missing values.", "error") + } } + + +enforce_numeric_cols <- function() { + routine_data <<- dhis2_routine |> + mutate(across(any_of(c("YEAR", "MONTH", "CONF", "TEST", "SUSP", "PRES")), as.numeric)) + log_msg("Created 'routine_data' dataframe. Ensured correct data types for DHIS2 routine data numerical columns: YEAR, MONTH, CONF, TEST, SUSP, PRES.") + if (!is.null(reporting_rate_month)) { + reporting_rate_data <<- reporting_rate_month |> + mutate(across(c(YEAR, MONTH, REPORTING_RATE), as.numeric)) + log_msg("Created 'reporting_rate_data' dataframe. Ensured correct data types for Reporting Rate data numerical columns: YEAR, MONTH, REPORTING_RATE.") + } else { + log_msg("Reporting Rate data frame is NULL. 
Skipping data type enforcement for Reporting Rate.", "warning") + } +} + + +handle_zeros_in_reporting_rate <- function() { + if (!is.null(reporting_rate_data)) { + zero_reporting <<- reporting_rate_data %>% + filter(REPORTING_RATE == 0) %>% + summarise( + n_months_zero_reporting = n(), + affected_zones = n_distinct(ADM2_ID) + ) + if (zero_reporting$n_months_zero_reporting > 0) { + log_msg(glue("🚨 Note: {zero_reporting$n_months_zero_reporting} rows had `REPORTING_RATE == 0` across ", + "{zero_reporting$affected_zones} ADM2. These N2 values were set to NA.")) + } else { + log_msg("✅ Note: no ADM2 has `REPORTING_RATE == 0`. All N2 values were preserved.") + } + } else { + log_msg("🚨 Reporting Rate data frame is NULL. Cannot check for zero reporting rates.", "error") + } +} + + +build_monthly_cases <- function( + routine_data, + reporting_rate_data, + N1_METHOD, + care_seeking_data_f = NULL, + careseeking_data = NULL +) { + monthly_cases <- routine_data |> + dplyr::group_by(ADM1_ID, ADM2_ID, YEAR, MONTH) |> + dplyr::summarise( + CONF = sum(CONF, na.rm = TRUE), + TEST = sum(TEST, na.rm = TRUE), + SUSP = sum(SUSP, na.rm = TRUE), + dplyr::across(dplyr::any_of("PRES"), ~sum(., na.rm = TRUE), .names = "PRES"), + .groups = "drop" + ) |> + dplyr::mutate(TEST = ifelse(N1_METHOD == "SUSP-TEST" & !is.na(SUSP) & (TEST > SUSP), SUSP, TEST)) |> + dplyr::left_join(reporting_rate_data, by = c("ADM2_ID", "YEAR", "MONTH")) |> + dplyr::mutate(TPR = ifelse(!is.na(CONF) & !is.na(TEST) & (TEST != 0), CONF / TEST, 1)) + + if (N1_METHOD == "SUSP-TEST") { + monthly_cases <- monthly_cases %>% + dplyr::mutate(N1 = CONF + ((SUSP - TEST) * TPR)) + log_msg("Calculating N1 as `N1 = CONF + ((SUSP - TEST) * TPR)`") + } else if (N1_METHOD == "PRES") { + if ("PRES" %in% names(monthly_cases) && !all(is.na(monthly_cases$PRES))) { + monthly_cases <- monthly_cases %>% + dplyr::mutate(N1 = CONF + (PRES * TPR)) + log_msg("ℹ️ Calculating N1 as `N1 = CONF + (PRES * TPR)`") + } else { + log_msg("🚨 Warning: 
'PRES' not found in routine data or contains all `NA` values! 🚨 Calculating N1 using 'SUSP-TEST' method instead.") + monthly_cases <- monthly_cases %>% + dplyr::mutate(N1 = CONF + ((SUSP - TEST) * TPR)) + } + } else { + log_msg("Invalid N1_METHOD. Please use 'PRES' or 'SUSP-TEST'.") + } + + monthly_cases <- monthly_cases %>% + dplyr::mutate(N2 = ifelse(REPORTING_RATE == 0, NA_real_, N1 / REPORTING_RATE)) + + if (!is.null(care_seeking_data_f)) { + monthly_cases <- monthly_cases %>% + dplyr::left_join(care_seeking_data_f %>% dplyr::select(ADM1_ID, PCT), by = c("ADM1_ID")) %>% + dplyr::mutate(N3 = N2 / PCT) %>% + dplyr::select(-PCT) + log_msg("N2 adjusted by care seeking data (NER Specific).") + } + + if (!is.null(careseeking_data)) { + monthly_cases <- monthly_cases |> + dplyr::mutate(YEAR = as.numeric(YEAR)) |> + dplyr::left_join(careseeking_data, by = c("ADM1_ID")) |> + dplyr::mutate( + N3 = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE) + ) + } else { + print("🦘 Careseeking data not available, skipping calculation of N3.") + } + + monthly_cases +} + + +build_yearly_incidence <- function(monthly_cases, dhis2_population_adm2, care_seeking_data_f = NULL, careseeking_data = NULL) { + monthly_cases <- monthly_cases %>% + dplyr::mutate(dplyr::across(where(is.numeric), as.numeric)) + + population_data <- dhis2_population_adm2 %>% + dplyr::mutate(dplyr::across(c(YEAR, POPULATION), as.numeric)) + + yearly_incidence <- monthly_cases %>% + dplyr::group_by(ADM2_ID, YEAR) %>% + dplyr::summarise( + dplyr::across(c(CONF, N1, N2), ~sum(.)), + .groups = "drop" + ) %>% + dplyr::left_join( + population_data, + by = c("ADM2_ID", "YEAR") + ) %>% + dplyr::mutate( + INCIDENCE_CRUDE = CONF / POPULATION * 1000, + INCIDENCE_ADJ_TESTING = N1 / POPULATION * 1000, + INCIDENCE_ADJ_REPORTING = N2 / POPULATION * 1000 + ) |> + dplyr::ungroup() + + if (!is.null(care_seeking_data_f) && "N3" %in% names(monthly_cases)) { + n3_data <- monthly_cases %>% + 
dplyr::group_by(ADM2_ID, YEAR) %>% + dplyr::summarise(N3 = sum(N3, na.rm = TRUE), .groups = "drop") |> + dplyr::ungroup() + + yearly_incidence <- yearly_incidence %>% + dplyr::left_join(n3_data, by = c("ADM2_ID", "YEAR")) %>% + dplyr::mutate(INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000) + } else if (!is.null(careseeking_data) && "N3" %in% names(monthly_cases)) { + n3_data <- monthly_cases %>% + dplyr::group_by(ADM2_ID, YEAR) %>% + dplyr::summarise(N3 = sum(N3, na.rm = TRUE), .groups = "drop") |> + dplyr::ungroup() + + yearly_incidence <- yearly_incidence %>% + dplyr::left_join(n3_data, by = c("ADM2_ID", "YEAR")) %>% + dplyr::mutate(INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000) + } else { + yearly_incidence <- yearly_incidence |> + dplyr::mutate(INCIDENCE_ADJ_CARESEEKING = NA) + } + + yearly_incidence +} + + +export_monthly_cases <- function(monthly_cases) { + file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, "_monthly_cases.parquet")) + arrow::write_parquet(monthly_cases, file_path) + log_msg(glue("Monthly cases data saved to: {file_path}")) +} + + +coherence_check_PRES <- function(monthly_cases) { + # Run this check only if N1_METHOD == "PRES" (else, problem doesn't exist) + if (N1_METHOD == "PRES") { + nr_of_pres_0_adm2_month <<- monthly_cases |> filter(PRES == 0) |> nrow() + log_msg(glue("🚨 Note: using `PRES` for incidence adjustement, but `PRES == 0` for {nr_of_pres_0_adm2_month} rows (ADM2 x MONTH)."), "warning") + } else { + log_msg("N1_METHOD is not set to 'PRES'. No need to check for coherence of `PRES` column.") + } +} + +coherence_check_SUSP_TEST <- function(monthly_cases) { + # Logically, there should not be more tested cases than suspected cases. 
+ if (N1_METHOD == "SUSP-TEST") { + nr_of_negative <<- monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() + if (nr_of_negative > 0) { + log_msg( + glue("🚨 Note: using formula `SUSP - TEST` for incidence adjustement, but higher tested than suspected cases (`SUSP < TEST`) detected in {nr_of_negative} rows (ADM2 x MONTH)."), + "warning" + ) + } else { + log_msg("✅ Note: using `SUSP - TEST` for incidence adjustment, no cases where `TEST > SUSP` detected.") + } + } else { + log_msg("N1_METHOD is not set to 'SUSP-TEST'. No need to check for coherence of `SUSP` and `TEST` columns.") + } +} + + +coherence_check_CONF_TEST <- function(monthly_cases) { + more_confirmed_than_tested <<- monthly_cases |> mutate(CONF_divby_TEST = CONF / TEST) |> filter(CONF_divby_TEST > 1) |> nrow() + if (more_confirmed_than_tested > 0) { + log_msg(glue("🚨 Note: higher confirmed than tested cases (`CONF/TEST`) detected in {more_confirmed_than_tested} rows (ADM2 x MONTH)."), "warning") + } else { + log_msg("✅ Note: no cases where `CONF > TEST` detected.") + } +} + + +coherence_checkes_yearly_incidence <- function(yearly_incidence, incidence_col_1, incidence_col_2) { + nr_of_impossible_values <<- yearly_incidence |> + mutate(IMPOSSIBLE_VALUE = if_else(!!sym(incidence_col_2) < !!sym(incidence_col_1), TRUE, FALSE)) |> + pull(IMPOSSIBLE_VALUE) |> + sum(na.rm = TRUE) + if (nr_of_impossible_values > 0) { + log_msg(glue::glue("🚨 Warning: found {nr_of_impossible_values} rows where {incidence_col_2} < {incidence_col_1}!"), "warning") + } else log_msg(glue::glue("✅ For all YEAR and ADM2, `{incidence_col_1}` is smaller than `{incidence_col_2}` (as expected).")) + # Check if all values in the column are NA, which indicates that the adjustment method did not work for any ADM2 and month, which is a problem. 
+ if (all(is.na(yearly_incidence[[incidence_col_2]]))) { + log_msg(glue::glue("🚨 Warning: all values of `{incidence_col_2}` are `NA`s"), "warning") + } +} + + +# Reusable function to generate filename and save data --------------------------------------------- +save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function) { + base_name_parts <- c(COUNTRY_CODE, "_incidence") + # --- Concatenate all parts to form the final filename --- + file_name <- paste0(c(base_name_parts, file_extension), collapse = "") + file_path <- file.path(data_path, file_name) + output_dir <- dirname(file_path) + # --- Check if the output directory exists, else create it --- + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } + # --- Flexibility to use function as provided in argument: "write_csv" or "arrow::write_parquet" ... --- + write_function(yearly_incidence, file_path) + log_msg(paste0("Exporting : ", file_path)) +} \ No newline at end of file From 9b61636fc7fd63c0e52e7ecdba1926d300c9c2cf Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 31 Mar 2026 10:43:42 +0200 Subject: [PATCH 18/23] should be it --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 2220 ++++++++--------- .../snt_dhis2_reporting_rate_dataelement.r | 183 ++ .../snt_dhis2_reporting_rate_dataset.ipynb | 227 +- ..._dhis2_reporting_rate_dataset_report.ipynb | 7 +- .../utils/snt_dhis2_reporting_rate_dataset.r | 109 + 5 files changed, 1362 insertions(+), 1384 deletions(-) create mode 100644 pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r create mode 100644 pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 63b0504..ac7cce0 100644 --- 
a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1,1232 +1,1076 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": 
"2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Current options: \n", - "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", - "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", - "\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 0.000095, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1.2. 
Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", - "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", - "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", - "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", - "# See: 
https://bluesquare.atlassian.net/browse/SNT25-158\n", - "# The variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "8bf4a8bb", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b40207", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 0.000093, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 0.000069, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] + "cells": [ + { + "cell_type": "markdown", + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# log\n", - "log_msg(glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. 
Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" - ] + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
 \n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ] + }, + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ] + "tags": [] + }, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load file from dataset\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 
pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - " \n", - "msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset: `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - "log_msg(msg)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted,2)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", + "\n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00011, + "end_time": "2026-01-16T10:23:56.165873", + "exception": false, + "start_time": "2026-01-16T10:23:56.165763", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": 
"2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.3. Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ] + "tags": [] + }, + "source": [ + "### 1.1. Fallback parameters values\n", + "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", + "The code cell below here provides fallback paramater values needed when running this notebook locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "papermill": { + "duration": 0.033954, + "end_time": "2026-01-16T10:23:56.199937", + "exception": false, + "start_time": "2026-01-16T10:23:56.165983", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Current options: \n", + "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", + "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", + "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", + "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", + "\n", + "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", + "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", + "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" + ] + }, + { + "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 9.5e-05, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", - "metadata": { - "papermill": { - "duration": 0.000091, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ] + "tags": [] + }, + "source": [ + "### 1.2. 
Load and check `snt config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "papermill": { + "duration": 0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "7d62cdb6", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: 
{length(period_vector)}\"))" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", + "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", + "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", + "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", + "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", + "# See: https://bluesquare.atlassian.net/browse/SNT25-158\n", + "# The variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", + "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", + "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", + "\n", + "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", + "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", + "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] + }, + { + "cell_type": "markdown", + "id": "8bf4a8bb", + "metadata": {}, + "source": [ + "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", + "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", + "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b40207", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!length(ACTIVITY_INDICATORS) > 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 9.3e-05, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.1. Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ] + "tags": [] + }, + "source": [ + "## 2. 
Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 6.9e-05, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", - "\n", - "facility_master <- dhis2_pyramid_formatted %>%\n", - " rename(\n", - " OU_ID = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\"),\n", - " OU_NAME = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME\"),\n", - " ADM2_ID = str_replace(ADMIN_2, \"NAME\", \"ID\"),\n", - " ADM2_NAME = all_of(ADMIN_2),\n", - " ADM1_ID = str_replace(ADMIN_1, \"NAME\", \"ID\"),\n", - " ADM1_NAME = all_of(ADMIN_1)\n", - " ) %>%\n", - " select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>%\n", - " distinct() %>%\n", - " tidyr::crossing(PERIOD = period_vector) %>%\n", - " mutate(PERIOD=as.numeric(PERIOD))\n", - " " - ] + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.2. Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", + "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b279d27", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Assessing facility reporting activity based on the following indicators: {paste(ACTIVITY_INDICATORS, 
collapse=', ')}\"))\n", - "\n", - "facility_master_routine <- left_join(\n", - " facility_master,\n", - " # dhis2_routine %>% select(OU_ID, PERIOD, all_of(DHIS2_INDICATORS)), # GP 2026-02-04\n", - " dhis2_routine %>% select(OU_ID, PERIOD, any_of(DHIS2_INDICATORS)), \n", - " by = c(\"OU_ID\", \"PERIOD\")\n", - " ) %>%\n", - " mutate(\n", - " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", - " ACTIVE_THIS_PERIOD = ifelse(\n", - " rowSums(!is.na(across(all_of(ACTIVITY_INDICATORS))) & across(all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0), \n", - " COUNT = 1 # Counting every facility\n", - " )" - ] + "tags": [] + }, + "source": [ + "### 2.2. Organisation units (DHIS2 pyramid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fd92901-901e-4019-be78-a7718050c1c4", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.3. Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. The period falls within the facility’s opening and closing dates. 
The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted, 2)\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "facility_master_routine <- facility_master_routine %>%\n", - " mutate(\n", - " period_date = as.Date(ym(PERIOD)),\n", - " \n", - " # Flag facilities explicitly marked as closed in their name\n", - " NAME_CLOSED = str_detect(\n", - " toupper(OU_NAME),\n", - " \"CLOTUR|FERM(E|EE)?\"\n", - " ),\n", - "\n", - " # Check whether the facility is open during the period using open/close dates\n", - " OPEN_BY_DATE = \n", - " !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", - " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)\n", - " ),\n", - " \n", - " # Final definition of an open facility for the period:\n", - " # not explicitly closed, within opening/closing dates,\n", - " # and started reporting\n", - " OPEN = ifelse(\n", - " !NAME_CLOSED & OPEN_BY_DATE,\n", - " 1, 0\n", - " )\n", - " )" - 
] + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "657fd6ca", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", + "}\n", + "\n", + "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. Process cannot continue.\")\n", + " cat(msg)\n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", + "metadata": { + "papermill": { + "duration": 9.1e-05, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "a598e4b7", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ] + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ] + }, + { + "cell_type": "markdown", + "id": "7d62cdb6", + "metadata": {}, + "source": [ + "#### 3.0. Define start and end period based on routine data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Flag facilities with at least one report in the year\n", - "facility_master_routine_01 <- facility_master_routine %>%\n", - " group_by(OU_ID, YEAR) %>%\n", - " mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% # use max() to flag if ACTIVE_THIS_PERIOD is 1 at least once\n", - " ungroup()" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", + "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", + "\n", + "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" }, - { - "cell_type": "markdown", - "id": 
"160c08ec-cc9a-4e1a-99ec-f703db83a71d", - "metadata": { - "papermill": { - "duration": 0.000098, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ] + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9308197a-0852-4d34-8888-cf5564f35a9d", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Computing volume of activity using indicator: {paste(VOLUME_ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "# Compute MEAN_REPORTED_CASES_BY_HF as total cases over months with activity\n", - "mean_monthly_cases <- dhis2_routine %>% \n", - " mutate(total_cases_by_hf_month = rowSums(across(all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", - " group_by(ADM2_ID, OU_ID) %>% \n", - " summarise(\n", - " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", - " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", - " .groups = \"drop\"\n", - " ) %>% \n", - " mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", - " select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", - 
"\n", - "mean_monthly_cases_adm2 <- mean_monthly_cases %>% \n", - " select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% \n", - " group_by(ADM2_ID) %>% \n", - " summarise(SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm=TRUE), \n", - " NR_OF_HF = n())\n", - "\n", - "# Compute weights\n", - "hf_weights <- mean_monthly_cases %>% \n", - " left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", - " mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n", - "\n", - "# Join with rest of data\n", - "facility_master_routine_02 <- facility_master_routine_01 %>%\n", - " left_join(hf_weights %>% select(OU_ID, WEIGHT), by = c(\"OU_ID\"))" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", + "facility_master <- build_facility_master_dataelement(\n", + " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", + " period_vector = period_vector,\n", + " config_json = config_json,\n", + " ADMIN_1 = ADMIN_1,\n", + " ADMIN_2 = ADMIN_2\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ] + "tags": [] + }, + "source": [ + "#### 3.2. 
Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b279d27", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "reporting_rate_dataelement <- compute_reporting_rate_dataelement(\n", + " facility_master = facility_master,\n", + " dhis2_routine = dhis2_routine,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " ACTIVITY_INDICATORS = ACTIVITY_INDICATORS,\n", + " VOLUME_ACTIVITY_INDICATORS = VOLUME_ACTIVITY_INDICATORS,\n", + " DATAELEMENT_METHOD_DENOMINATOR = DATAELEMENT_METHOD_DENOMINATOR,\n", + " USE_WEIGHTED_REPORTING_RATES = USE_WEIGHTED_REPORTING_RATES\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Computing weighted variables for reporting rate calculation.\"))\n", - "\n", - "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT \n", - 
"facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n", - "\n", - "dim(facility_master_routine_02)\n", - "head(facility_master_routine_02, 2)" - ] + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", + "metadata": { + "papermill": { + "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.7. 
Aggregate data at ADM2 level" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "657fd6ca", + "metadata": {}, + "source": [ + "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" + ] + }, + { + "cell_type": "markdown", + "id": "a598e4b7", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", + "
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "af13191e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Aggregating data at admin level 2.\"))\n", - "\n", - "reporting_rate_adm2 <- facility_master_routine_02 %>% \n", - " group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", - " summarise(\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), # (numerator) sum of all facilities active per PERIOD\n", - " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), # (denominator) sum of all facilities active at least once in the YEAR\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", - " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), \n", - " .groups = \"drop\")\n", - "\n", - "dim(reporting_rate_adm2)\n", - "# head(reporting_rate_adm2, 5)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", + "metadata": { + "papermill": { + "duration": 9.8e-05, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "7d381937", - "metadata": {}, - "source": [ - 
"#### 3.8. Calculate Reporting Rates (all methods)" - ] + "tags": [] + }, + "source": [ + "#### 3.5. Compute Weighting factor based on \"volume of activity\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4420e559-4134-4fc3-8950-9972ebede00e", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "b41263f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Calculating Reporting Rates at admin level 2. Using all methods, weighted and unweighted.\"))\n", - "\n", - "reporting_rate_adm2 <- reporting_rate_adm2 %>% \n", - " mutate(\n", - " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", - " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", - " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", - " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", - " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", - " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", - " )\n", - "\n", - "dim(reporting_rate_adm2)\n", - "head(reporting_rate_adm2, 5)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "5e593659", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", 
- "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. Select correct col for `REPORTING_RATE` based on denominator method" - ] + "tags": [] + }, + "source": [ + "#### 3.6. Compute Weighted variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "c75f2249", - "metadata": { - "papermill": { - "duration": 0.000057, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 4.1. Select results and format" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e71b38", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") { \n", - " rr_column_selection <- \"RR_ACTIVE_HF\" \n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_ACTIVE_HF_W\"\n", - " }\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") 
{\n", - " rr_column_selection <- \"RR_OPEN_HF\"\n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_OPEN_HF_W\"\n", - " }\n", - "}" - ] + "tags": [] + }, + "source": [ + "#### 3.7. Aggregate data at ADM2 level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af13191e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "7d381937", + "metadata": {}, + "source": [ + "#### 3.8. Calculate Reporting Rates (all methods)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b41263f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e593659", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "3df36abb", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Using reporting rate column: `{rr_column_selection}` \n", - "based on DATAELEMENT_METHOD_DENOMINATOR == {DATAELEMENT_METHOD_DENOMINATOR} \n", - "and USE_WEIGHTED_REPORTING_RATES == {USE_WEIGHTED_REPORTING_RATES}\"))" - ] + "tags": [] + }, + "source": [ + "## 4. 
Select correct col for `REPORTING_RATE` based on denominator method" + ] + }, + { + "cell_type": "markdown", + "id": "c75f2249", + "metadata": { + "papermill": { + "duration": 5.7e-05, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Formatting table for '{DATAELEMENT_METHOD_DENOMINATOR}' selection.\"))\n", - "\n", - "# Select column and format final table\n", - "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", - " mutate(MONTH = PERIOD %% 100) %>%\n", - " rename(REPORTING_RATE = !!sym(rr_column_selection)) %>%\n", - " select(all_of(fixed_cols_rr))\n", - "\n", - "print(dim(reporting_rate_dataelement))\n", - "head(reporting_rate_dataelement, 3)" - ] + "tags": [] + }, + "source": [ + "### 4.1. Select results and format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e71b38", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "ca66e785", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 
Inspect reporting rate values" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df36abb", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "31535459", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccc272c", + "metadata": { + "papermill": { + "duration": 0.182574, + "end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "6778f17d", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": false, - "start_time": "2026-01-16T10:24:07.836278", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", 
- " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ca66e785", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] + "tags": [] + }, + "source": [ + "## 5. 
Inspect reporting rate values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31535459", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", - "metadata": { - "papermill": { - "duration": 0.000088, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 📁 Export to `data/` folder" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6778f17d", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", - "write_parquet(reporting_rate_dataelement, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", 
- "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", - "write.csv(reporting_rate_dataelement, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ] + "tags": [], + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + }, + "outputs": [], + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7f013fd", + "metadata": { + "papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot without outliers\n", + " geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", 
+ " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "2866816a-7015-4c5c-b904-f553f3b4790d", + "metadata": { + "papermill": { + "duration": 8.8e-05, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": "completed" }, + "tags": [] + }, + "source": [ + "## 5. 📁 Export to `data/` folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", + "metadata": { "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - "AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "export_reporting_rate_dataelement(\n", + " reporting_rate_dataelement = reporting_rate_dataelement,\n", + " DATA_PATH = DATA_PATH,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": 
"r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "nbformat": 4, - "nbformat_minor": 5 + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "AVAILABILITY_INDICATORS": [ + "CONF", + "PRES", + "SUSP", + "TEST" + ], + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r new file mode 100644 index 0000000..c73601b --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -0,0 +1,183 @@ +select_routine_dataset_name_dataelement <- function(ROUTINE_FILE, COUNTRY_CODE, config_json) { + if (ROUTINE_FILE == glue::glue("{COUNTRY_CODE}_routine.parquet")) { + return(config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED) + } + config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION +} + + +load_routine_data_dataelement <- function(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE) { + dhis2_routine <- tryCatch({ + get_latest_dataset_file_in_memory(rountine_dataset_name, 
ROUTINE_FILE) + }, error = function(e) { + msg <- paste("[ERROR] Error while loading DHIS2 routine data file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + + dhis2_routine <- dhis2_routine %>% + dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) + + log_msg(glue::glue( + "DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}" + )) + + dhis2_routine +} + + +load_pyramid_data_dataelement <- function(config_json, COUNTRY_CODE) { + dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + + dhis2_pyramid_formatted <- tryCatch({ + get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_pyramid.parquet")) + }, error = function(e) { + msg <- paste("Error while loading DHIS2 pyramid FORMATTED data file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + + log_msg(paste0( + "DHIS2 pyramid FORMATTED data loaded from dataset: `", dataset_name, + "`. 
Dataframe dimensions: ", paste(dim(dhis2_pyramid_formatted), collapse = ", ") + )) + + dhis2_pyramid_formatted +} + + +build_facility_master_dataelement <- function( + dhis2_pyramid_formatted, + period_vector, + config_json, + ADMIN_1, + ADMIN_2 +) { + dhis2_pyramid_formatted %>% + dplyr::rename( + OU_ID = glue::glue("LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID"), + OU_NAME = glue::glue("LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME"), + ADM2_ID = stringr::str_replace(ADMIN_2, "NAME", "ID"), + ADM2_NAME = dplyr::all_of(ADMIN_2), + ADM1_ID = stringr::str_replace(ADMIN_1, "NAME", "ID"), + ADM1_NAME = dplyr::all_of(ADMIN_1) + ) %>% + dplyr::select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>% + dplyr::distinct() %>% + tidyr::crossing(PERIOD = period_vector) %>% + dplyr::mutate(PERIOD = as.numeric(PERIOD)) +} + + +compute_reporting_rate_dataelement <- function( + facility_master, + dhis2_routine, + DHIS2_INDICATORS, + ACTIVITY_INDICATORS, + VOLUME_ACTIVITY_INDICATORS, + DATAELEMENT_METHOD_DENOMINATOR, + USE_WEIGHTED_REPORTING_RATES +) { + facility_master_routine <- dplyr::left_join( + facility_master, + dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)), + by = c("OU_ID", "PERIOD") + ) %>% + dplyr::mutate( + YEAR = as.numeric(substr(PERIOD, 1, 4)), + ACTIVE_THIS_PERIOD = ifelse( + rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) & + dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0 + ), + COUNT = 1 + ) %>% + dplyr::mutate( + period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), "%Y%m")), + NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), "CLOTUR|FERM(E|EE)?"), + OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date | + (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)), + OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0) + ) %>% + dplyr::group_by(OU_ID, YEAR) %>% + dplyr::mutate(ACTIVE_THIS_YEAR = 
max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% + dplyr::ungroup() + + mean_monthly_cases <- dhis2_routine %>% + dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>% + dplyr::group_by(ADM2_ID, OU_ID) %>% + dplyr::summarise( + total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE), + number_of_reporting_months = length(which(total_cases_by_hf_month > 0)), + .groups = "drop" + ) %>% + dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>% + dplyr::select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF) + + mean_monthly_cases_adm2 <- mean_monthly_cases %>% + dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% + dplyr::group_by(ADM2_ID) %>% + dplyr::summarise( + SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE), + NR_OF_HF = dplyr::n() + ) + + hf_weights <- mean_monthly_cases %>% + dplyr::left_join(mean_monthly_cases_adm2, by = "ADM2_ID") %>% + dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF) + + facility_master_routine_02 <- facility_master_routine %>% + dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c("OU_ID")) + + facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT + facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT + facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT + facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT + + reporting_rate_adm2 <- facility_master_routine_02 %>% + dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>% + dplyr::summarise( + HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), + NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = 
TRUE), + NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE), + HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), + HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE), + NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE), + NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE), + HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::mutate( + RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2, + RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2, + RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2, + RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED, + RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED, + RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED + ) + + rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == "ROUTINE_ACTIVE_FACILITIES") "RR_ACTIVE_HF" else "RR_OPEN_HF" + if (USE_WEIGHTED_REPORTING_RATES) { + rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == "ROUTINE_ACTIVE_FACILITIES") "RR_ACTIVE_HF_W" else "RR_OPEN_HF_W" + } + + reporting_rate_adm2 %>% + dplyr::mutate(MONTH = PERIOD %% 100) %>% + dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>% + dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE) +} + + +export_reporting_rate_dataelement <- function(reporting_rate_dataelement, DATA_PATH, COUNTRY_CODE) { + output_data_path <- file.path(DATA_PATH, "reporting_rate") + if (!dir.exists(output_data_path)) { + dir.create(output_data_path, recursive = TRUE) + } + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataelement.parquet")) + arrow::write_parquet(reporting_rate_dataelement, file_path) + log_msg(glue::glue("Exported : {file_path}")) + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, 
"_reporting_rate_dataelement.csv")) + write.csv(reporting_rate_dataelement, file_path, row.names = FALSE) + log_msg(glue::glue("Exported : {file_path}")) +} diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index 18d1d35..f26b3b6 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -61,7 +61,7 @@ "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", "metadata": { "papermill": { - "duration": 0.000092, + "duration": 9.2e-05, "end_time": "2025-12-19T10:21:50.273573", "exception": false, "start_time": "2025-12-19T10:21:50.273481", @@ -99,16 +99,18 @@ "outputs": [], "source": [ "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", "\n", "# Load utils\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\")\n", "install_and_load(required_packages)\n", "\n", "# Environment variables\n", @@ -117,7 +119,7 @@ 
"Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "\n", "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "openhexa <- import(\"openhexa.sdk\")\n" ] }, { @@ -373,34 +375,10 @@ }, "outputs": [], "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# Subset data to keep only columns defined in fixed_cols_rr (if defined)\n", - "if (exists(\"fixed_cols_rr\")) {\n", - " dhis2_routine <- dhis2_routine %>% \n", - " select(any_of(fixed_cols_rr)) |> \n", - " distinct()\n", - "}\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", + "rountine_dataset_name <- select_routine_dataset_name_dataset(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", + "dhis2_routine <- load_routine_data_dataset(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE, fixed_cols_rr)\n", "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)" + "head(dhis2_routine, 3)\n" ] }, { @@ -445,23 +423,8 @@ }, "outputs": [], "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "file_name <- paste0(COUNTRY_CODE, 
\"_reporting.parquet\") # reporting rate file\n", - "\n", - "# Load file from dataset\n", - "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", - "\n", - "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). \n", - "Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - "log_msg(msg)\n", - "head(dhis2_reporting, 3)" + "dhis2_reporting <- load_reporting_data_dataset(config_json, COUNTRY_CODE)\n", + "head(dhis2_reporting, 3)\n" ] }, { @@ -526,16 +489,11 @@ }, "outputs": [], "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", - "if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", - " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", - " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", - " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", - " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. \n", - " 🦘 Skipping filtering and keeping all data. 
Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", - "}" + "reporting_rate_results <- compute_reporting_rate_dataset(\n", + " dhis2_reporting = dhis2_reporting,\n", + " REPORTING_RATE_PRODUCT_ID = REPORTING_RATE_PRODUCT_ID,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" ] }, { @@ -580,16 +538,7 @@ }, "outputs": [], "source": [ - "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", - "dhis2_reporting_wide <- dhis2_reporting %>%\n", - " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", - "\n", - "# Log msg\n", - "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", - "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" + "# Moved to utils for readability.\n" ] }, { @@ -652,25 +601,7 @@ }, "outputs": [], "source": [ - "# Check if any OU_ID is present in more than one PRODUCT_UID\n", - "# and if so list them\n", - "ou_product_counts <- dhis2_reporting %>%\n", - " group_by(OU_ID, OU_NAME) %>%\n", - " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", - " filter(PRODUCT_UID_count > 1) %>%\n", - " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", - " distinct() \n", - "\n", - "ou_product_counts\n", - "\n", - "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", - "if (nrow(ou_product_counts) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", - "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", - " level = \"warning\")\n", - "} else {\n", - " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting 
data.\")\n", - "}" + "# Moved to utils for readability.\n" ] }, { @@ -723,20 +654,7 @@ }, "outputs": [], "source": [ - "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", - "dupl_ou_period <- dhis2_reporting_wide %>%\n", - " group_by(OU_ID, PERIOD) %>%\n", - " filter(n() > 1) %>%\n", - " ungroup() %>%\n", - " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", - "\n", - "# Log msg\n", - "if (nrow(dupl_ou_period) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", - "}\n", - "\n", - "dim(dupl_ou_period)\n", - "head(dupl_ou_period, 5)" + "# Moved to utils for readability.\n" ] }, { @@ -764,29 +682,7 @@ }, "outputs": [], "source": [ - "# Step 2: remove duplicated OU_ID by PERIOD\n", - "# Use the following logic:\n", - "# - 1. first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", - "# - 2. 
then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", - "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", - "\n", - "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " group_by(PERIOD, OU_ID) %>%\n", - " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", - " ungroup() %>%\n", - " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", - " select(-ACTUAL_REPORTS_deduplicated)\n", - "\n", - " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", - " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(\"🚨 Warning: Cannot deduplicate OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. 
\n", - " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", - "} \n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" + "# Moved to utils for readability.\n" ] }, { @@ -834,38 +730,7 @@ }, "outputs": [], "source": [ - "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", - "\n", - " # Check if any values >1 exist\n", - " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", - " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", - "\n", - " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", - " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", - " dupl_actual_reports <- dhis2_reporting_wide %>%\n", - " filter(ACTUAL_REPORTS > 1) %>%\n", - " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", - " distinct()\n", - "\n", - " log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", - "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", - "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", - "\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "\n", - " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", - "\n", - "} # else nothing to replace\n", - "\n", - " dim(dhis2_reporting_wide)\n", - " head(dhis2_reporting_wide, 3)\n", - "}" + "# Moved to utils for 
readability.\n" ] }, { @@ -910,24 +775,7 @@ }, "outputs": [], "source": [ - "# Sum up values (now at acility level) to get totals per ADM2_ID and PERIOD\n", - "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", - " group_by(\n", - " PERIOD, \n", - " YEAR, MONTH, # keep these just for sanity check (not needed for grouping)\n", - " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", - " ADM2_NAME, ADM2_ID\n", - " ) %>%\n", - " summarise(\n", - " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", - " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", - " .groups = 'drop'\n", - " ) \n", - "\n", - "# Add log messages\n", - "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. \n", - "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", - "head(dhis2_reporting_wide_adm2, 3)" + "# Moved to utils for readability.\n" ] }, { @@ -974,12 +822,7 @@ }, "outputs": [], "source": [ - "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - "\n", - "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. 
Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", - "head(reporting_rate_results, 3) " + "# Moved to utils for readability.\n" ] }, { @@ -1152,17 +995,11 @@ }, "outputs": [], "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")) \n", - "write_parquet(reporting_rate_dataset, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", - "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" + "export_reporting_rate_dataset(\n", + " reporting_rate_dataset = reporting_rate_dataset,\n", + " DATA_PATH = DATA_PATH,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" ] } ], diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb index 884b286..90e4762 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb @@ -265,7 +265,12 @@ }, "outputs": [], "source": [ - "break_vals <- jsonlite::fromJSON(metadata_json$REPORTING_RATE$SCALE)\n", + "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", + "break_vals <- if (is.character(scale_raw) && length(scale_raw) == 1) {\n", + " jsonlite::fromJSON(scale_raw)\n", + "} else {\n", + " as.numeric(unlist(scale_raw, use.names = FALSE))\n", + "}\n", "\n", "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" ] diff --git 
a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r new file mode 100644 index 0000000..285727c --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -0,0 +1,109 @@ +select_routine_dataset_name_dataset <- function(ROUTINE_FILE, COUNTRY_CODE, config_json) { + if (ROUTINE_FILE == glue::glue("{COUNTRY_CODE}_routine.parquet")) { + return(config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED) + } + config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION +} + + +load_routine_data_dataset <- function(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE, fixed_cols_rr) { + dhis2_routine <- tryCatch({ + get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) + }, error = function(e) { + msg <- paste("Error while loading DHIS2 routine data file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + + dhis2_routine <- dhis2_routine %>% dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) + dhis2_routine <- dhis2_routine %>% dplyr::select(dplyr::any_of(fixed_cols_rr)) %>% dplyr::distinct() + + log_msg(glue::glue( + "DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}" + )) + dhis2_routine +} + + +load_reporting_data_dataset <- function(config_json, COUNTRY_CODE) { + dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + file_name <- paste0(COUNTRY_CODE, "_reporting.parquet") + + dhis2_reporting <- tryCatch({ + get_latest_dataset_file_in_memory(dataset_name, file_name) + }, error = function(e) { + msg <- paste("[ERROR] Error while loading DHIS2 dataset reporting rates file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + dhis2_reporting <- dhis2_reporting %>% dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), 
as.numeric)) + + log_msg(paste0( + "DHIS2 Datatset reporting data loaded from file `", file_name, "` (from dataset : `", dataset_name, "`). Dataframe dimensions: ", + paste(dim(dhis2_reporting), collapse = ", ") + )) + dhis2_reporting +} + + +compute_reporting_rate_dataset <- function(dhis2_reporting, REPORTING_RATE_PRODUCT_ID, COUNTRY_CODE) { + if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) { + dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) + } else { + log_msg(glue::glue( + "🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. Skipping filtering." + ), level = "warning") + } + + dhis2_reporting_wide <- dhis2_reporting %>% tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE) + + dupl_ou_period <- dhis2_reporting_wide %>% + dplyr::group_by(OU_ID, PERIOD) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::ungroup() %>% + dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with("REPORTS")) + + if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1))) { + dhis2_reporting_wide <- dhis2_reporting_wide %>% + dplyr::group_by(PERIOD, OU_ID) %>% + dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>% + dplyr::ungroup() %>% + dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>% + dplyr::select(-ACTUAL_REPORTS_deduplicated) + } + + if (COUNTRY_CODE == "NER") { + dhis2_reporting_wide <- dhis2_reporting_wide %>% + dplyr::mutate( + ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS), + EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS) + ) + } + + dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>% + dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>% + dplyr::summarise( + 
ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE), + EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE), + .groups = "drop" + ) + + dhis2_reporting_wide_adm2 %>% + dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS) +} + + +export_reporting_rate_dataset <- function(reporting_rate_dataset, DATA_PATH, COUNTRY_CODE) { + output_data_path <- file.path(DATA_PATH, "reporting_rate") + if (!dir.exists(output_data_path)) { + dir.create(output_data_path, recursive = TRUE) + } + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataset.parquet")) + arrow::write_parquet(reporting_rate_dataset, file_path) + log_msg(glue::glue("Exported : {file_path}")) + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataset.csv")) + write.csv(reporting_rate_dataset, file_path, row.names = FALSE) + log_msg(glue::glue("Exported : {file_path}")) +} From b3eab9f4899cec214fae8b9f4299feef91b5f8da Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 31 Mar 2026 12:11:42 +0200 Subject: [PATCH 19/23] milestone, everything is working --- .../snt_dhis2_population_transformation.ipynb | 902 ++++--- ...is2_population_transformation_report.ipynb | 174 +- .../snt_dhis2_population_transformation.r | 104 + ...t_dhis2_population_transformation_report.r | 59 + .../code/snt_seasonality_cases.ipynb | 2106 ++++++++-------- .../utils/snt_seasonality_cases.r | 61 + .../code/snt_seasonality_rainfall.ipynb | 2109 ++++++++--------- .../utils/snt_seasonality_rainfall.r | 66 + 8 files changed, 2811 insertions(+), 2770 deletions(-) create mode 100644 pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r create mode 100644 pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r create mode 100644 pipelines/snt_seasonality_rainfall/utils/snt_seasonality_rainfall.r diff --git a/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb 
b/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb index 3b0ab07..3135161 100644 --- a/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb +++ b/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb @@ -1,480 +1,432 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "96eee97c-fda4-4827-8111-c438cabed82e", - "metadata": {}, - "source": [ - "## Setup start " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1156ab5-2dc6-4bfb-8d7a-ac594c40ecf8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Parameters\n", - "# ADJUST_WITH_UNTOTALS <- TRUE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "687392e7-fe6c-4355-9f4d-6718b467a33d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set project folders\n", - "SNT_ROOT_PATH <- '~/workspace'\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "POPULATION_DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\", \"population_transformed\")\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"glue\", \"reticulate\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "21c802bd-506b-4e60-bd06-b715a5c197ee", - "metadata": {}, - "source": [ - "### Validate parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"0f4210e2-d68e-4884-9ed1-e36008a6c0e9", - "metadata": {}, - "outputs": [], - "source": [ - "if(!exists(\"ADJUST_WITH_UNTOTALS\")) ADJUST_WITH_UNTOTALS <- FALSE" - ] - }, - { - "cell_type": "markdown", - "id": "c0ea5ae5-5a21-4508-897d-06180e79abbb", - "metadata": {}, - "source": [ - "### Load SNT configuration\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10580e48-ccb5-49df-933d-3cdbc480a402", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "96eee97c-fda4-4827-8111-c438cabed82e", + "metadata": {}, + "source": [ + "## Setup start " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1156ab5-2dc6-4bfb-8d7a-ac594c40ecf8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Parameters\n", + "# ADJUST_WITH_UNTOTALS <- TRUE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "687392e7-fe6c-4355-9f4d-6718b467a33d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set project folders\n", + "SNT_ROOT_PATH <- '~/workspace'\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_population_transformation\")\n", + "POPULATION_DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\", \"population_transformed\")\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_population_transformation.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"glue\", \"reticulate\", \"rlang\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = 
\"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "21c802bd-506b-4e60-bd06-b715a5c197ee", + "metadata": {}, + "source": [ + "### Validate parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f4210e2-d68e-4884-9ed1-e36008a6c0e9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if(!exists(\"ADJUST_WITH_UNTOTALS\")) ADJUST_WITH_UNTOTALS <- FALSE" + ] + }, + { + "cell_type": "markdown", + "id": "c0ea5ae5-5a21-4508-897d-06180e79abbb", + "metadata": {}, + "source": [ + "### Load SNT configuration\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10580e48-ccb5-49df-933d-3cdbc480a402", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# config file path\n", + "config_path <- file.path(CONFIG_PATH, \"SNT_config.json\")\n", + "\n", + "config_json <- tryCatch({ fromJSON(config_path) },\n", + " error = function(e) {\n", + " msg <- glue(\"Error while loading configuration: {config_path}\")\n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "# print(config.json$SNT_CONFIG)\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "log_msg(msg)\n", + "\n", + "# Save this country code in a variable\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "format_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "n_years_future <- 6 # n_years to the future \n", + "n_years_past <- 6 # n_years to the past " + ] + }, + { + "cell_type": "markdown", + "id": "1f0ce7a3-954b-4711-bd23-268aeb74f1d7", + "metadata": {}, + "source": [ + "### Load DHIS2 population data (formatted dataset)" + ] + }, + { + 
"cell_type": "markdown", + "id": "77af1690-79c4-4ad4-92c3-47a5dd119906", + "metadata": {}, + "source": [ + "-Load DHIS2 population from latest dataset version \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82d0d5e9-2cc5-4101-9ea6-59aafdcf5b81", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load file from dataset\n", + "dhis2_population <- tryCatch({ \n", + " get_latest_dataset_file_in_memory(format_dataset_id, paste0(COUNTRY_CODE, \"_population.parquet\")) \n", + " }, error = function(e) {\n", + " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " log_msg(msg, \"error\")\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- glue(\"DHIS2 population data loaded from dataset : {format_dataset_id} dataframe dimensions: [{paste(dim(dhis2_population), collapse=', ')}]\")\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dcbe0f6-77b5-42f3-bb49-569fb391070b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dim(dhis2_population)\n", + "head(dhis2_population, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "0d351974-c1c5-4971-97c8-f0122ca9e803", + "metadata": {}, + "source": [ + "## SNT population scaling" + ] + }, + { + "cell_type": "markdown", + "id": "d25b0a44-bdaf-42fe-aa50-c421804906f0", + "metadata": {}, + "source": [ + "Adjust DHIS2 population using UN data (downloaded from WorldPop) as scaling factor (optional).\n", + "If this option is **not selected**, we try adjusting using the total population reference from configuration.\n", + "\n", + "Details: \n", + "- To adjust using population from UN total estimates data, **must** run *B.1 WorldPop Extract pipeline* first!.\n", + "- We assume we have (Wpop) UN ajd population data downloaded only for one year (latest). 
\n", + "- If the (Wpop) UN adj data is not available, we continue the process and try with total population reference from configuration (if set).\n", + "- The scaled population will be stored in the column \"POPULATION\" (the original POPULATION column will be replaced)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99c541e2-4962-4d7d-b4cf-4a53435ef464", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "total_population_reference <- get_total_population_reference(\n", + " config_json,\n", + " adjust_with_untotals = ADJUST_WITH_UNTOTALS\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "c80c3ef3-c7a6-4845-9134-3900bfba5eef", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "Scale population using total population reference (if available)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04ac7ee1-ce72-4c50-8af5-a6d3d534ec6e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_population <- apply_total_population_scaling(\n", + " dhis2_population = dhis2_population,\n", + " total_population_reference = total_population_reference\n", + ")\n", + "\n", + "head(dhis2_population, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "e0fccb2b-1b4b-4d7b-b879-81d6289a090c", + "metadata": {}, + "source": [ + "## SNT Population projection and back-calculation using a growth factor" + ] + }, + { + "cell_type": "markdown", + "id": "383454f9-b30d-400a-82ef-8cbddb7295e3", + "metadata": {}, + "source": [ + "Apply a growth factor (if defined in the snt config file).\n", + "\n", + "-Projects the population size backward and forward in time (years) using growth rates. 
\n", + "-For the computation, we consider only one population reference (column) as initial value \"POPULATION\" or \"POPULATION_SCALED\" (is computed in previous steps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21819151-1501-431e-858e-3c3a423bb4ef", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "growth_factor <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"GROWTH_FACTOR\"]]\n", + "reference_year <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"REFERENCE_YEAR\"]]\n", + "\n", + "pop_result <- project_population_with_growth(\n", + " dhis2_population = dhis2_population,\n", + " growth_factor = growth_factor,\n", + " reference_year = reference_year,\n", + " n_years_past = n_years_past,\n", + " n_years_future = n_years_future\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed946301-56b0-4bcd-80c7-acae5f9c186d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check total populations per year\n", + "for (year in sort(unique(pop_result$YEAR))) {\n", + " tot_pop <- sum(pop_result[pop_result$YEAR == year, \"POPULATION\"], na.rm = TRUE)\n", + " print(glue(\"Total population {year} : {tot_pop}\"))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33c45d55-b3ee-400a-b013-23097b7a9d98", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "print(dim(pop_result))\n", + "head(pop_result, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "cd7c7fcc-16b7-4931-a73e-c82604a1d108", + "metadata": {}, + "source": [ + "## SNT Population disaggregations\n", + "\n", + "Any defined disaggregations will be computed from the 'POPULATION_DISAGGREGATIONS' in the configuration file and included as additional columns in the final table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15a820b2-e58f-4859-a0ea-c4b1266ae624", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "pop_disagg <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"POPULATION_DISAGGREGATIONS\"]]\n", + "pop_result <- add_population_disaggregations(pop_result, pop_disagg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2610f36a-5745-4df4-bf5f-7b59c669f099", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "print(dim(pop_result))\n", + "head(pop_result, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "15514c28-37a8-4d7d-896f-0cfe5f157668", + "metadata": {}, + "source": [ + "## Output formatted population data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d704604-8af9-48bf-afc4-3965a634ac75", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "out_msg <- paste0(\"Transfomerd population data saved under: \", file.path(POPULATION_DATA_PATH, paste0(COUNTRY_CODE, \"_population.csv\")))\n", + "\n", + "# write parquet file\n", + "write_parquet(pop_result, file.path(POPULATION_DATA_PATH, paste0(COUNTRY_CODE, \"_population.parquet\")))\n", + "\n", + "# write csv file\n", + "write.csv(pop_result, file.path(POPULATION_DATA_PATH, paste0(COUNTRY_CODE, \"_population.csv\")), row.names = FALSE)\n", + "\n", + "# log\n", + "log_msg(out_msg)" + ] + }, + { + "cell_type": "markdown", + "id": "07ce08f0-526e-48e9-a72d-16aafc1f40b8", + "metadata": {}, + "source": [ + "### Data Summary " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbaa160c-04e4-4b9c-95e4-b320a358ce40", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Data summary\n", + "print(summary(pop_result))" + ] } - }, - "outputs": [], - "source": [ - "# config file path\n", - "config_path <- 
file.path(CONFIG_PATH, \"SNT_config.json\")\n", - "\n", - "config_json <- tryCatch({ fromJSON(config_path) },\n", - " error = function(e) {\n", - " msg <- glue(\"Error while loading configuration: {config_path}\")\n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# print(config.json$SNT_CONFIG)\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "log_msg(msg)\n", - "\n", - "# Save this country code in a variable\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "format_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "n_years_future <- 6 # n_years to the future \n", - "n_years_past <- 6 # n_years to the past " - ] - }, - { - "cell_type": "markdown", - "id": "1f0ce7a3-954b-4711-bd23-268aeb74f1d7", - "metadata": {}, - "source": [ - "### Load DHIS2 population data (formatted dataset)" - ] - }, - { - "cell_type": "markdown", - "id": "77af1690-79c4-4ad4-92c3-47a5dd119906", - "metadata": {}, - "source": [ - "-Load DHIS2 population from latest dataset version \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82d0d5e9-2cc5-4101-9ea6-59aafdcf5b81", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load file from dataset\n", - "dhis2_population <- tryCatch({ \n", - " get_latest_dataset_file_in_memory(format_dataset_id, paste0(COUNTRY_CODE, \"_population.parquet\")) \n", - " }, error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " log_msg(msg, \"error\")\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- glue(\"DHIS2 population data loaded from dataset : {format_dataset_id} dataframe dimensions: [{paste(dim(dhis2_population), collapse=', 
')}]\")\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5dcbe0f6-77b5-42f3-bb49-569fb391070b", - "metadata": {}, - "outputs": [], - "source": [ - "dim(dhis2_population)\n", - "head(dhis2_population, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "0d351974-c1c5-4971-97c8-f0122ca9e803", - "metadata": {}, - "source": [ - "## SNT population scaling" - ] - }, - { - "cell_type": "markdown", - "id": "d25b0a44-bdaf-42fe-aa50-c421804906f0", - "metadata": {}, - "source": [ - "Adjust DHIS2 population using UN data (downloaded from WorldPop) as scaling factor (optional).\n", - "If this option is **not selected**, we try adjusting using the total population reference from configuration.\n", - "\n", - "Details: \n", - "- To adjust using population from UN total estimates data, **must** run *B.1 WorldPop Extract pipeline* first!.\n", - "- We assume we have (Wpop) UN ajd population data downloaded only for one year (latest). \n", - "- If the (Wpop) UN adj data is not available, we continue the process and try with total population reference from configuration (if set).\n", - "- The scaled population will be stored in the column \"POPULATION\" (the original POPULATION column will be replaced)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99c541e2-4962-4d7d-b4cf-4a53435ef464", - "metadata": {}, - "outputs": [], - "source": [ - "# default values\n", - "total_population_reference <- NULL\n", - "\n", - "if (ADJUST_WITH_UNTOTALS) {\n", - " \n", - " total_population_reference <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"TOTAL_POPULATION_REF\"]] \n", - " if (is.null(total_population_reference)) {\n", - " log_msg(glue(\"No total population reference found in 'snt_config'. 
Adjustment will be skipped.\"), \"warning\")\n", - " } else {\n", - " log_msg(glue(\"Using total population reference from SNT configuration file: {total_population_reference}\"))\n", - " }\n", - " } \n" - ] - }, - { - "cell_type": "markdown", - "id": "c80c3ef3-c7a6-4845-9134-3900bfba5eef", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "Scale population using total population reference (if available)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04ac7ee1-ce72-4c50-8af5-a6d3d534ec6e", - "metadata": {}, - "outputs": [], - "source": [ - "if (!is.null(total_population_reference)) {\n", - "\n", - " # Compute totals per DHIS2 year\n", - " year_totals <- dhis2_population %>%\n", - " group_by(YEAR) %>%\n", - " summarise(total_year_pop = sum(POPULATION, na.rm = TRUE))\n", - " \n", - " # Compute scaling factor per year\n", - " year_totals <- year_totals %>%\n", - " mutate(scaling_factor = total_population_reference / total_year_pop)\n", - " \n", - " # Join back and compute adjusted population\n", - " dhis2_population <- dhis2_population %>%\n", - " left_join(year_totals, by = \"YEAR\") %>%\n", - " mutate(POPULATION_SCALED = round(POPULATION * scaling_factor)) %>%\n", - " select(-total_year_pop, -scaling_factor)\n", - " \n", - " for (i in seq_len(nrow(year_totals))) {\n", - " row <- year_totals[i, ]\n", - " dhis2_total = sum(dhis2_population[dhis2_population$YEAR==row$YEAR, \"POPULATION\"], na.rm=TRUE)\n", - " dhis2_total_scd = sum(dhis2_population[dhis2_population$YEAR==row$YEAR, \"POPULATION_SCALED\"], na.rm=TRUE)\n", - " log_msg(glue(\"DHIS2 population year {row$YEAR} ({dhis2_total}) scaled: {dhis2_total_scd} (scaling_factor={round(row$scaling_factor, 3)}).\"))\n", - " } \n", - "\n", - " head(dhis2_population, 3)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e0fccb2b-1b4b-4d7b-b879-81d6289a090c", - "metadata": {}, - "source": [ - "## SNT Population projection and back-calculation using a growth factor" - 
] - }, - { - "cell_type": "markdown", - "id": "383454f9-b30d-400a-82ef-8cbddb7295e3", - "metadata": {}, - "source": [ - "Apply a growth factor (if defined in the snt config file).\n", - "\n", - "-Projects the population size backward and forward in time (years) using growth rates. \n", - "-For the computation, we consider only one population reference (column) as initial value \"POPULATION\" or \"POPULATION_SCALED\" (is computed in previous steps)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21819151-1501-431e-858e-3c3a423bb4ef", - "metadata": {}, - "outputs": [], - "source": [ - "# try using growth_factor from config file.\n", - "growth_factor <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"GROWTH_FACTOR\"]]\n", - "reference_year <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"REFERENCE_YEAR\"]]\n", - "\n", - "# which population column to use?\n", - "population_column <- ifelse((\"POPULATION_SCALED\" %in% colnames(dhis2_population)), \"POPULATION_SCALED\", \"POPULATION\") \n", - "columns_selection <- c(\"YEAR\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", population_column) # WE SELECT POPULATION OR (IF EXISTS) POPULATION_SCALED COLUMN (!)\n", - "\n", - "if (!is.null(growth_factor)) {\n", - " \n", - " # Set reference_year to the max year if NULL or not present\n", - " if (is.null(reference_year) || !(reference_year %in% dhis2_population$YEAR)) {\n", - " not_found <- reference_year\n", - " reference_year <- max(dhis2_population$YEAR)\n", - " \n", - " if (!is.null(not_found)) {\n", - " log_msg(\n", - " glue(\"Reference year {not_found} is not present in the population data, using last year: {reference_year}.\"), \n", - " \"warning\"\n", - " )\n", - " }\n", - " }\n", - " \n", - " log_msg(glue(\"Applying growth factor {growth_factor} to project {tolower(population_column)} from reference year {reference_year}.\")) \n", - " projection_years_backward <- seq(reference_year - 1, reference_year - 
n_years_past, by=-1)\n", - " projection_years_forward <- seq(reference_year + 1, reference_year + n_years_future)\n", - " \n", - " dhis2_population_reference <- dhis2_population[dhis2_population$YEAR == reference_year, columns_selection]\n", - " pop_result <- dhis2_population_reference\n", - " population_forward <- dhis2_population_reference\n", - " population_backward <- dhis2_population_reference\n", - " total_pop_year <- list()\n", - " \n", - " # --- Forward projection ---\n", - " for (year in projection_years_forward) {\n", - " population_forward[[\"YEAR\"]] <- year\n", - " population_forward[[population_column]] <- round(population_forward[[population_column]] * (1 + growth_factor)) \n", - " pop_result <- rbind(pop_result, population_forward) \n", - " }\n", - " \n", - " # --- Backward projection ---\n", - " for (year in projection_years_backward) {\n", - " population_backward[[\"YEAR\"]] <- year\n", - " population_backward[[population_column]] <- round(population_backward[[population_column]] / (1 + growth_factor)) \n", - " pop_result <- rbind(pop_result, population_backward)\n", - " }\n", - " \n", - " pop_result <- pop_result[order(pop_result$YEAR), ] \n", - " \n", - "} else {\n", - " # We need to modify the input to produce a similar table (format)\n", - " pop_result <- dhis2_population[order(dhis2_population$YEAR), columns_selection]\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed946301-56b0-4bcd-80c7-acae5f9c186d", - "metadata": {}, - "outputs": [], - "source": [ - "# Check total populations per year\n", - "for (year in sort(unique(pop_result$YEAR))) {\n", - " tot_pop <- sum(pop_result[pop_result$YEAR == year, population_column], na.rm=TRUE)\n", - " print(glue(\"Total population {year} : {tot_pop}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33c45d55-b3ee-400a-b013-23097b7a9d98", - "metadata": {}, - "outputs": [], - "source": [ - "# Rename the output column\n", - "pop_result <- 
pop_result %>% rename(POPULATION = !!population_column)\n", - "\n", - "print(dim(pop_result))\n", - "head(pop_result, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "cd7c7fcc-16b7-4931-a73e-c82604a1d108", - "metadata": {}, - "source": [ - "## SNT Population disaggregations\n", - "\n", - "Any defined disaggregations will be computed from the 'POPULATION_DISAGGREGATIONS' in the configuration file and included as additional columns in the final table." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15a820b2-e58f-4859-a0ea-c4b1266ae624", - "metadata": {}, - "outputs": [], - "source": [ - "pop_disagg <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[[\"POPULATION_DISAGGREGATIONS\"]]\n", - "\n", - "# Check if the list exists and is not empty\n", - "if (!is.null(pop_disagg) && length(pop_disagg) > 0) {\n", - " \n", - " for (name in names(pop_disagg)) {\n", - " value <- pop_disagg[[name]]\n", - " log_msg(glue::glue(\"Adding disaggregation: {name}, Factor: {value}\"))\n", - " pop_result[[toupper(name)]] <- round(pop_result[[\"POPULATION\"]] * value)\n", - " }\n", - " \n", - "} else {\n", - " message(\"No population disaggregations defined.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2610f36a-5745-4df4-bf5f-7b59c669f099", - "metadata": {}, - "outputs": [], - "source": [ - "print(dim(pop_result))\n", - "head(pop_result, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "15514c28-37a8-4d7d-896f-0cfe5f157668", - "metadata": {}, - "source": [ - "## Output formatted population data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d704604-8af9-48bf-afc4-3965a634ac75", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "out_msg <- paste0(\"Transfomerd population data saved under: \", file.path(POPULATION_DATA_PATH, paste0(COUNTRY_CODE, \"_population.csv\")))\n", - "\n", - "# write parquet file\n", - "write_parquet(pop_result, 
file.path(POPULATION_DATA_PATH, paste0(COUNTRY_CODE, \"_population.parquet\")))\n", - "\n", - "# write csv file\n", - "write.csv(pop_result, file.path(POPULATION_DATA_PATH, paste0(COUNTRY_CODE, \"_population.csv\")), row.names = FALSE)\n", - "\n", - "# log\n", - "log_msg(out_msg)" - ] - }, - { - "cell_type": "markdown", - "id": "07ce08f0-526e-48e9-a72d-16aafc1f40b8", - "metadata": {}, - "source": [ - "### Data Summary " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cbaa160c-04e4-4b9c-95e4-b320a358ce40", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# Data summary\n", - "print(summary(pop_result))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb b/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb index 5c21a62..a144e1c 100644 --- a/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb +++ b/pipelines/snt_dhis2_population_transformation/reporting/snt_dhis2_population_transformation_report.ipynb @@ -315,12 +315,12 @@ "\n", " # --- Filter data to keep only 2022-2024 ... 
---\n", " years_to_keep <- 2022:2024\n", - " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", + " population_data_filtered <- population_data |> dplyr::filter(YEAR %in% years_to_keep)\n", "\n", - " # --- Assign population breaks from metadata ---\n", - " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", - " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", - " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", + " # --- Assign population breaks from metadata (supports JSON string or list) ---\n", + " value_breaks_tot <- parse_metadata_scale(metadata_json$POPULATION_TOTAL$SCALE)\n", + " value_breaks_u5 <- parse_metadata_scale(metadata_json$POPULATION_U5$SCALE)\n", + " value_breaks_fe <- parse_metadata_scale(metadata_json$POPULATION_PREGNANT$SCALE)\n", "\n", " # --- Create dynamic labels based on breaks ---\n", " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", @@ -371,50 +371,16 @@ "source": [ "if (COUNTRY_CODE == \"NER\") {\n", "\n", - " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", - "names(NER_palette_population) <- labels_tot\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION,\n", - " breaks = c(0, value_breaks_tot, Inf),\n", - " labels = labels_tot, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\",\n", - " fill = \"Population Totale:\"\n", - " ) +\n", - " scale_fill_manual(\n", - " 
values = NER_palette_population, \n", - " limits = labels_tot, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title.position = \"top\",\n", - " legend.title = element_text(face = \"bold\"),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(20, 0, 0, 0)\n", - " )\n", + "plot <- build_population_choropleth(\n", + " population_data_filtered = population_data_filtered,\n", + " shapes_data = shapes_data,\n", + " population_column = \"POPULATION\",\n", + " breaks_values = value_breaks_tot,\n", + " labels = labels_tot,\n", + " legend_title = \"Population Totale:\",\n", + " plot_title = \"Population totale par district sanitaire (DS)\",\n", + " palette_values = NER_palette_population\n", + ")\n", "\n", "print(plot)\n", "\n", @@ -451,49 +417,16 @@ "source": [ "if (COUNTRY_CODE == \"NER\") {\n", "\n", - "names(NER_palette_population) <- labels_fe\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", - " breaks = c(0, value_breaks_fe, Inf),\n", - " labels = labels_fe, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\",\n", - " fill = \"Population Femmes Enceintes:\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = 
NER_palette_population, \n", - " limits = labels_fe, \n", - " drop = FALSE # Prevents dropping empty levels from legend\n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_text(face = \"bold\"),\n", - " legend.title.position = \"top\",\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(20, 0, 0, 0)\n", - " )\n", + "plot <- build_population_choropleth(\n", + " population_data_filtered = population_data_filtered,\n", + " shapes_data = shapes_data,\n", + " population_column = \"POPULATION_FE\",\n", + " breaks_values = value_breaks_fe,\n", + " labels = labels_fe,\n", + " legend_title = \"Population Femmes Enceintes:\",\n", + " plot_title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", + " palette_values = NER_palette_population\n", + ")\n", "\n", "print(plot)\n", "\n", @@ -502,7 +435,7 @@ " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_fe.png\")),\n", " create.dir = TRUE,\n", " units = \"cm\",\n", - " width = 21, \n", + " width = 21,\n", " height = 15,\n", " dpi = 300\n", ")\n", @@ -531,49 +464,16 @@ "source": [ "if (COUNTRY_CODE == \"NER\") {\n", "\n", - "names(NER_palette_population) <- labels_u5\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", - " breaks = c(0, value_breaks_u5, Inf),\n", - " labels = labels_u5, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " 
color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\",\n", - " fill = \"Population Enfants de moins de 5 ans:\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_u5, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_text(face = \"bold\"),\n", - " legend.title.position = \"top\",\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(20, 0, 0, 0)\n", - " )\n", + "plot <- build_population_choropleth(\n", + " population_data_filtered = population_data_filtered,\n", + " shapes_data = shapes_data,\n", + " population_column = \"POPULATION_U5\",\n", + " breaks_values = value_breaks_u5,\n", + " labels = labels_u5,\n", + " legend_title = \"Population Enfants de moins de 5 ans:\",\n", + " plot_title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", + " palette_values = NER_palette_population\n", + ")\n", "\n", "print(plot)\n", "\n", @@ -582,7 +482,7 @@ " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_transformed_u5.png\")),\n", " create.dir = TRUE,\n", " units = \"cm\",\n", - " width = 21, \n", + " width = 21,\n", " height = 15,\n", " dpi = 300\n", ")\n", diff --git a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r new file mode 100644 index 0000000..f610951 --- /dev/null +++ 
b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r @@ -0,0 +1,104 @@ +get_total_population_reference <- function(config_json, adjust_with_untotals = FALSE) { + if (!adjust_with_untotals) { + return(NULL) + } + + total_population_reference <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[["TOTAL_POPULATION_REF"]] + if (is.null(total_population_reference)) { + log_msg("No total population reference found in 'snt_config'. Adjustment will be skipped.", "warning") + return(NULL) + } + + log_msg(glue::glue("Using total population reference from SNT configuration file: {total_population_reference}")) + total_population_reference +} + + +apply_total_population_scaling <- function(dhis2_population, total_population_reference) { + if (is.null(total_population_reference)) { + return(dhis2_population) + } + + year_totals <- dhis2_population %>% + dplyr::group_by(YEAR) %>% + dplyr::summarise(total_year_pop = sum(POPULATION, na.rm = TRUE), .groups = "drop") %>% + dplyr::mutate(scaling_factor = total_population_reference / total_year_pop) + + dhis2_population_scaled <- dhis2_population %>% + dplyr::left_join(year_totals, by = "YEAR") %>% + dplyr::mutate(POPULATION_SCALED = round(POPULATION * scaling_factor)) %>% + dplyr::select(-total_year_pop, -scaling_factor) + + for (i in seq_len(nrow(year_totals))) { + row <- year_totals[i, ] + dhis2_total <- sum(dhis2_population_scaled[dhis2_population_scaled$YEAR == row$YEAR, "POPULATION"], na.rm = TRUE) + dhis2_total_scd <- sum(dhis2_population_scaled[dhis2_population_scaled$YEAR == row$YEAR, "POPULATION_SCALED"], na.rm = TRUE) + log_msg(glue::glue("DHIS2 population year {row$YEAR} ({dhis2_total}) scaled: {dhis2_total_scd} (scaling_factor={round(row$scaling_factor, 3)}).")) + } + + dhis2_population_scaled +} + + +project_population_with_growth <- function(dhis2_population, growth_factor, reference_year, n_years_past = 6, n_years_future = 6) { + population_column <- 
ifelse(("POPULATION_SCALED" %in% colnames(dhis2_population)), "POPULATION_SCALED", "POPULATION") + columns_selection <- c("YEAR", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", population_column) + + if (is.null(growth_factor)) { + pop_result <- dhis2_population[order(dhis2_population$YEAR), columns_selection] + pop_result <- pop_result %>% dplyr::rename(POPULATION = !!rlang::sym(population_column)) + return(pop_result) + } + + if (is.null(reference_year) || !(reference_year %in% dhis2_population$YEAR)) { + not_found <- reference_year + reference_year <- max(dhis2_population$YEAR) + + if (!is.null(not_found)) { + log_msg( + glue::glue("Reference year {not_found} is not present in the population data, using last year: {reference_year}."), + "warning" + ) + } + } + + log_msg(glue::glue("Applying growth factor {growth_factor} to project {tolower(population_column)} from reference year {reference_year}.")) + projection_years_backward <- seq(reference_year - 1, reference_year - n_years_past, by = -1) + projection_years_forward <- seq(reference_year + 1, reference_year + n_years_future) + + dhis2_population_reference <- dhis2_population[dhis2_population$YEAR == reference_year, columns_selection] + pop_result <- dhis2_population_reference + population_forward <- dhis2_population_reference + population_backward <- dhis2_population_reference + + for (year in projection_years_forward) { + population_forward[["YEAR"]] <- year + population_forward[[population_column]] <- round(population_forward[[population_column]] * (1 + growth_factor)) + pop_result <- rbind(pop_result, population_forward) + } + + for (year in projection_years_backward) { + population_backward[["YEAR"]] <- year + population_backward[[population_column]] <- round(population_backward[[population_column]] / (1 + growth_factor)) + pop_result <- rbind(pop_result, population_backward) + } + + pop_result <- pop_result[order(pop_result$YEAR), ] + pop_result %>% dplyr::rename(POPULATION = 
!!rlang::sym(population_column)) +} + + +add_population_disaggregations <- function(pop_result, pop_disagg) { + if (is.null(pop_disagg) || length(pop_disagg) == 0) { + message("No population disaggregations defined.") + return(pop_result) + } + + for (name in names(pop_disagg)) { + value <- pop_disagg[[name]] + log_msg(glue::glue("Adding disaggregation: {name}, Factor: {value}")) + pop_result[[toupper(name)]] <- round(pop_result[["POPULATION"]] * value) + } + + pop_result +} diff --git a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r index 265a7b1..dc2d4bc 100644 --- a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r +++ b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation_report.r @@ -16,3 +16,62 @@ create_dynamic_labels <- function(breaks) { ) return(labels) } + + +parse_metadata_scale <- function(scale_value) { + if (is.character(scale_value) && length(scale_value) == 1) { + return(jsonlite::fromJSON(scale_value)) + } + as.numeric(unlist(scale_value, use.names = FALSE)) +} + + +build_population_choropleth <- function( + population_data_filtered, + shapes_data, + population_column, + breaks_values, + labels, + legend_title, + plot_title, + palette_values +) { + names(palette_values) <- labels + + population_data_filtered %>% + dplyr::mutate( + CATEGORY_POPULATION = cut( + .data[[population_column]], + breaks = c(0, breaks_values, Inf), + labels = labels, + right = TRUE, + include.lowest = TRUE + ) + ) %>% + dplyr::left_join(shapes_data, by = dplyr::join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% + ggplot2::ggplot() + + ggplot2::geom_sf( + ggplot2::aes(geometry = geometry, fill = CATEGORY_POPULATION), + color = "black", + linewidth = 0.25, + show.legend = TRUE + ) + + ggplot2::labs( + title = plot_title, + subtitle = "Source: 
NMDR / DHIS2", + fill = legend_title + ) + + ggplot2::scale_fill_manual(values = palette_values, limits = labels, drop = FALSE) + + ggplot2::facet_wrap(~YEAR, ncol = 3) + + ggplot2::theme_void() + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold"), + plot.subtitle = ggplot2::element_text(margin = ggplot2::margin(5, 0, 20, 0)), + legend.position = "bottom", + legend.title = ggplot2::element_text(face = "bold"), + legend.title.position = "top", + strip.text = ggplot2::element_text(face = "bold"), + legend.key.height = grid::unit(0.5, "line"), + legend.margin = ggplot2::margin(20, 0, 0, 0) + ) +} diff --git a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb index 669677e..1b8afef 100644 --- a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb +++ b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb @@ -1,1106 +1,1058 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "5eebc540-e973-497e-8427-e73d546fdd09", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + "cells": [ + { + "cell_type": "markdown", + "id": "5eebc540-e973-497e-8427-e73d546fdd09", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Case Seasonality Pipeline\n", + "\n", + "This pipeline classifies administrative units (`ADM2`) as **seasonal** or **non-seasonal** based on confirmed malaria case patterns, determines the **duration** of their transmission season, and identifies the respective **onset month**.\n", + "\n", + "### Methodology\n", + "\n", + "The pipeline employs a four-step hierarchical method:\n", + "\n", + "#### 1. 
Identify \"start\" month\n", + "Evaluates every specific month in the dataset (per district and year) to determine if it marks the beginning of a concentrated transmission period.\n", + "- **Logic**: For each month, compute a forward-looking proportion: the ratio of cases in the next n-month block to the annual total.\n", + "- **Denominator**: Can use either a 12-month forward-looking sliding window (WHO approach) or calendar year (Jan-Dec).\n", + "- **Threshold**: If this proportion exceeds `threshold_for_seasonality` (e.g., 60%), the month is flagged as a valid \"start\" month.\n", + "\n", + "#### 2. Classify ADM2 as \"Seasonal\" or \"Non-seasonal\"\n", + "Aggregates month-level flags to determine if the district consistently experiences a transmission season.\n", + "- **Logic**: Calculate the proportion of years that a specific month was flagged as a \"start\" in Step 1.\n", + "- **Consistency**: If this proportion exceeds `threshold_proportion_seasonal_years` (e.g., 70%), the district is classified as \"Seasonal\".\n", + "\n", + "#### 3. Determine season duration\n", + "Resolves cases where a district qualifies for multiple block durations by selecting the minimum duration.\n", + "- **Output**: `SEASONAL_BLOCK_DURATION_CASES` - the shortest window with the required case proportion.\n", + "\n", + "#### 4. 
Determine season onset\n", + "Identifies the single official start month using the mode (most frequent value) across years.\n", + "- **Output**: `SEASONAL_BLOCK_START_MONTH_CASES`\n", + "\n", + "---\n", + "\n", + "## Preliminaries" + ] }, - "tags": [] - }, - "source": [ - "## Case Seasonality Pipeline\n", - "\n", - "This pipeline classifies administrative units (`ADM2`) as **seasonal** or **non-seasonal** based on confirmed malaria case patterns, determines the **duration** of their transmission season, and identifies the respective **onset month**.\n", - "\n", - "### Methodology\n", - "\n", - "The pipeline employs a four-step hierarchical method:\n", - "\n", - "#### 1. Identify \"start\" month\n", - "Evaluates every specific month in the dataset (per district and year) to determine if it marks the beginning of a concentrated transmission period.\n", - "- **Logic**: For each month, compute a forward-looking proportion: the ratio of cases in the next n-month block to the annual total.\n", - "- **Denominator**: Can use either a 12-month forward-looking sliding window (WHO approach) or calendar year (Jan-Dec).\n", - "- **Threshold**: If this proportion exceeds `threshold_for_seasonality` (e.g., 60%), the month is flagged as a valid \"start\" month.\n", - "\n", - "#### 2. Classify ADM2 as \"Seasonal\" or \"Non-seasonal\"\n", - "Aggregates month-level flags to determine if the district consistently experiences a transmission season.\n", - "- **Logic**: Calculate the proportion of years that a specific month was flagged as a \"start\" in Step 1.\n", - "- **Consistency**: If this proportion exceeds `threshold_proportion_seasonal_years` (e.g., 70%), the district is classified as \"Seasonal\".\n", - "\n", - "#### 3. 
Determine season duration\n", - "Resolves cases where a district qualifies for multiple block durations by selecting the minimum duration.\n", - "- **Output**: `SEASONAL_BLOCK_DURATION_CASES` - the shortest window with the required case proportion.\n", - "\n", - "#### 4. Determine season onset\n", - "Identifies the single official start month using the mode (most frequent value) across years.\n", - "- **Output**: `SEASONAL_BLOCK_START_MONTH_CASES`\n", - "\n", - "---\n", - "\n", - "## Preliminaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7789c67", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Clear environment\n", - "rm(list=ls())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global settings\n", - "options(scipen=999)\n", - "\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6915379-108e-4405-b553-b074aad447d6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "f7789c67", + "metadata": { + "vscode": { + 
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Clear environment\n", + "rm(list=ls())" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_cases')\n", - "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "180b93e7-61af-4981-863f-593b755968bd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# List required pcks\n", - "required_packages <- c(\n", - " \"jsonlite\",\n", - " \"data.table\",\n", - " \"ggplot2\",\n", - " \"fpp3\",\n", - " \"arrow\",\n", - " \"glue\",\n", - " \"sf\",\n", - " \"RColorBrewer\",\n", - " \"httr\",\n", - " \"reticulate\"\n", - ")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global settings\n", + "options(scipen=999)\n", + "\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" 
+ ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Check that compute_month_seasonality() supports the required parameter\n", - "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", - " error_msg <- paste0(\n", - " \"Error: The function compute_month_seasonality() does not support the parameter 'use_calendar_year_denominator'. \",\n", - " \"Please ensure that the snt_utils.r file is updated to the latest version.\"\n", - " )\n", - " log_msg(error_msg, level = \"error\")\n", - " stop(error_msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "a6915379-108e-4405-b553-b074aad447d6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_seasonality_cases')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_cases')\n", + "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", 
conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741", - "metadata": {}, - "source": [ - "## Globals and parameters" - ] - }, - { - "cell_type": "markdown", - "id": "414f9ee0-5264-43c4-992f-cff6c719d65c", - "metadata": {}, - "source": [ - "**Parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb82560d-c123-4c54-bfa1-fb5f05e4ad69", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "minimum_periods <- as.integer(36)\n", - "maximum_proportion_missings_overall <- 0.1\n", - "maximum_proportion_missings_per_district <- 0.2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_seasonality_cases.r\"))" + ] }, - "tags": [ - "parameters" - ], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fallback parameter values for local/dev execution\n", - "# When run via pipeline, these are injected by Papermill in the first cell\n", - "if (!exists(\"minimum_month_block_size\")) {\n", - " minimum_month_block_size <- as.integer(3)\n", - "}\n", - "if 
(!exists(\"maximum_month_block_size\")) {\n", - " maximum_month_block_size <- as.integer(5)\n", - "}\n", - "if (!exists(\"threshold_for_seasonality\")) {\n", - " threshold_for_seasonality <- 0.6\n", - "}\n", - "if (!exists(\"threshold_proportion_seasonal_years\")) {\n", - " threshold_proportion_seasonal_years <- 0.5\n", - "}\n", - "if (!exists(\"use_calendar_year_denominator\")) {\n", - " use_calendar_year_denominator <- FALSE\n", - "}\n", - "\n", - "# Ensure correct types\n", - "minimum_month_block_size <- as.integer(minimum_month_block_size)\n", - "maximum_month_block_size <- as.integer(maximum_month_block_size)\n", - "\n", - "# Log parameter values\n", - "log_msg(paste(\"Minimum month block size:\", minimum_month_block_size))\n", - "log_msg(paste(\"Maximum month block size:\", maximum_month_block_size))\n", - "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", - "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", - "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" - ] - }, - { - "cell_type": "markdown", - "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630", - "metadata": {}, - "source": [ - "**Fixed routine formatting columns**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "180b93e7-61af-4981-863f-593b755968bd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List required pcks\n", + "required_packages <- c(\n", + " \"jsonlite\",\n", + " \"data.table\",\n", + " \"ggplot2\",\n", + " \"fpp3\",\n", + " \"arrow\",\n", + " \"glue\",\n", + " \"sf\",\n", + " \"RColorBrewer\",\n", + " \"httr\",\n", + " \"reticulate\"\n", + ")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] }, - "tags": [], - 
"vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global variables\n", - "type_of_seasonality <- \"cases\"\n", - "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", - "data_source <- \"DHIS2\"\n", - "original_values_col <- \"CONF\"\n", - "\n", - "# space and time columns\n", - "admin_level <- 'ADM2'\n", - "admin_id_col <- paste(admin_level, toupper('id'), sep = '_')\n", - "admin_name_col <- paste(admin_level, toupper('name'), sep = '_')\n", - "year_col <- 'YEAR'\n", - "month_col <- 'MONTH'\n", - "period_cols <- c(year_col, month_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Check that compute_month_seasonality() supports the required parameter\n", + "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", + " error_msg <- paste0(\n", + " \"Error: The function compute_month_seasonality() does not support the parameter 'use_calendar_year_denominator'. 
\",\n", + " \"Please ensure that the snt_utils.r file is updated to the latest version.\"\n", + " )\n", + " log_msg(error_msg, level = \"error\")\n", + " stop(error_msg)\n", + "}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", - "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", - "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" - ] - }, - { - "cell_type": "markdown", - "id": "86f492f3-5634-4987-a2b8-23014aba5d51", - "metadata": {}, - "source": [ - "## Load data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "623480ee-4310-4ead-a8c8-bf294527c814", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + 
"\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load routine data from dataset\n", - "case_data_filename <- paste(COUNTRY_CODE, \"routine.parquet\", sep = \"_\")\n", - "original_dt <- get_latest_dataset_file_in_memory(dhis2_dataset, case_data_filename)\n", - "log_msg(glue(\"File {case_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b769deb-52e5-471d-9950-ac431dd8cf03", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Columns formatting\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)\n", - "common_cols <- names(admin_data)\n", - "\n", - "seasonality_col <- glue('SEASONALITY', toupper(type_of_seasonality), .sep = \"_\")\n", - "season_duration_col <- glue('SEASONAL_BLOCK_DURATION', toupper(type_of_seasonality), .sep = \"_\")\n", - "season_start_month_col <- glue('SEASONAL_BLOCK_START_MONTH', toupper(type_of_seasonality), .sep = \"_\")\n", - "cases_proportion_col <- 'CASES_PROPORTION'\n", - "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", - "print(final_table_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d", - "metadata": {}, - "source": [ - "**Create the containers for the data**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - 
"outputs": [], - "source": [ - "# Create an empty table if the analysis is stopped for lack of enough data\n", - "seasonality_cols <- c(seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", - "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" - ] - }, - { - "cell_type": "markdown", - "id": "b8da71be-45f1-405c-857c-ed86984988f4", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741", + "metadata": {}, + "source": [ + "## Globals and parameters" + ] }, - "tags": [] - }, - "source": [ - "## Preprocess input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "414f9ee0-5264-43c4-992f-cff6c719d65c", + "metadata": {}, + "source": [ + "**Parameters**" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# format table\n", - "setDT(original_dt)\n", - "integer_cols <- c(year_col, month_col)\n", - "numeric_cols <- c(original_values_col)\n", - "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", - "# head(original_dt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "fb82560d-c123-4c54-bfa1-fb5f05e4ad69", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "minimum_periods <- as.integer(36)\n", + "maximum_proportion_missings_overall <- 0.1\n", + "maximum_proportion_missings_per_district <- 0.2" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# keep only the useful columns and aggregate the data on them\n", - 
"original_dt <- original_dt[,\n", - " setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), \n", - " by = c(admin_id_col, period_cols)\n", - " ]\n", - "\n", - "num_periods <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[1]]\n", - "all_rows <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[2]]\n", - "\n", - "if (num_periods < minimum_periods){ \n", - " log_msg(glue(\"Data is not reliable: \n", - " at least {minimum_periods} year-month periods of data are required for the case analyais; \n", - " the data only contains {num_periods} periods. Abandoning analysis.\")\n", - " , level=\"error\")\n", - " stop(\"ERROR 1\")\n", - "}\n", - "\n", - "# inject the (possibly missing) rows into the data\n", - "original_dt <- make_full_time_space_data(\n", - " input_dt=original_dt,\n", - " full_rows_dt=all_rows,\n", - " target_colname=original_values_col,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col)\n", - "\n", - "if(nrow(original_dt[is.na(get(original_values_col)),]) > (maximum_proportion_missings_overall * nrow(original_dt))){ \n", - " log_msg(\"There are too many missing values in the data overall. 
Abandoning analysis.\", level=\"error\")\n", - " stop(\"ERROR 2\") \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54", - "metadata": {}, - "source": [ - "### Imputation of missings" - ] - }, - { - "cell_type": "markdown", - "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12", - "metadata": {}, - "source": [ - "**Remove impute files (if any)**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Remove existing imputation files\n", - "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", - "files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", - "file.remove(files_to_remove)\n", - "print(glue(\"Deleted files: {str(files_to_remove)}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fallback parameter values for local/dev execution\n", + "# When run via pipeline, these are injected by Papermill in the first cell\n", + "if (!exists(\"minimum_month_block_size\")) {\n", + " minimum_month_block_size <- as.integer(3)\n", + "}\n", + "if (!exists(\"maximum_month_block_size\")) {\n", + " maximum_month_block_size <- as.integer(5)\n", + "}\n", + "if (!exists(\"threshold_for_seasonality\")) {\n", + " threshold_for_seasonality <- 0.6\n", + "}\n", + "if 
(!exists(\"threshold_proportion_seasonal_years\")) {\n", + " threshold_proportion_seasonal_years <- 0.5\n", + "}\n", + "if (!exists(\"use_calendar_year_denominator\")) {\n", + " use_calendar_year_denominator <- FALSE\n", + "}\n", + "\n", + "# Ensure correct types\n", + "minimum_month_block_size <- as.integer(minimum_month_block_size)\n", + "maximum_month_block_size <- as.integer(maximum_month_block_size)\n", + "\n", + "# Log parameter values\n", + "log_msg(paste(\"Minimum month block size:\", minimum_month_block_size))\n", + "log_msg(paste(\"Maximum month block size:\", maximum_month_block_size))\n", + "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", + "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", + "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# create the name of the column which will store the imputed/estimated values\n", - "imputed_col = paste(original_values_col, 'EST', sep = '_')\n", - "\n", - "# if there are rows of missing data for cases, impute them (SARIMA)\n", - "if(nrow(original_dt[!is.na(get(original_values_col)),]) != nrow(original_dt)) {\n", - " log_msg(\"There is missing data. Proceeding to impute them.\", level=\"warning\")\n", - " \n", - " # extract data on only the administrative units which have missing values for original_values_col\n", - " missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)\n", - " missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = YEAR, month = MONTH)]\n", - " missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]\n", - " \n", - " # how many rows missing for each administrative unit? 
if too many, then not good idea to impute\n", - " missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]\n", - " \n", - " # if for any given admin unit, more than a given % of data is missing, there's too much to impute (maybe should be stricter - to discuss)\n", - " if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){\n", - " log_msg(\"Some administrative units have too many missing values in the target data. Abandoning analysis.\", level=\"error\")\n", - " stop(\"ERROR 3\")\n", - " }\n", - " \n", - " # split to list per admin_unit_id, to apply SARIMA imputation on each time series (per admin unit)\n", - " missing_districts_list <- split(missing_dt, by = admin_id_col)\n", - " \n", - " # seasonal ARIMA to estimate missing cases: apply function to list of data.tables with missing rows, then create data.table from result\n", - " filled_missings_dt <- rbindlist(\n", - " lapply(missing_districts_list,\n", - " fill_missing_cases_ts,\n", - " original_values_colname=original_values_col,\n", - " estimated_values_colname=imputed_col,\n", - " admin_colname=admin_id_col,\n", - " period_colname='PERIOD',\n", - " threshold_for_missing = 0.0)\n", - " )\n", - " \n", - " # add the imputed (\"_EST\") values to the original data\n", - " imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)\n", - " \n", - " # copy from the districts without missings;\n", - " # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt\n", - " imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]\n", - "\n", - " # Save imputed file, only if it was computed (if there is missing data to impute)\n", - " safe_create_dir(INTERMEDIATE_RESULTS_PATH)\n", - " fwrite(imputed_dt, file = 
file.path(INTERMEDIATE_RESULTS_PATH, filename_imputed_dt))\n", - " \n", - "} else {\n", - " imputed_dt <- copy(original_dt)\n", - " imputed_dt[, (imputed_col) := get(original_values_col)]\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9db44942-d844-491c-9045-906e99a37c60", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630", + "metadata": {}, + "source": [ + "**Fixed routine formatting columns**" + ] }, - "tags": [] - }, - "source": [ - "## Seasonality" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global variables\n", + "type_of_seasonality <- \"cases\"\n", + "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", + "data_source <- \"DHIS2\"\n", + "original_values_col <- \"CONF\"\n", + "\n", + "# space and time columns\n", + "admin_level <- 'ADM2'\n", + "admin_id_col <- paste(admin_level, toupper('id'), sep = '_')\n", + "admin_name_col <- paste(admin_level, toupper('name'), sep = '_')\n", + "year_col <- 'YEAR'\n", + "month_col <- 'MONTH'\n", + "period_cols <- c(year_col, month_col)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: Compute month-level seasonality indicators\n", - "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", - "\n", - "row_seasonality_dt <- compute_month_seasonality(\n", - " input_dt=imputed_dt,\n", - " indicator=type_of_seasonality,\n", - " values_colname=imputed_col,\n", - " 
vector_of_durations=possible_month_block_sizes,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col,\n", - " proportion_threshold=threshold_for_seasonality,\n", - " use_calendar_year_denominator=use_calendar_year_denominator\n", - ")\n", - "\n", - "# Create the filename\n", - "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'row_seasonality', sep = '_')\n", - "filename_csv = glue(\"{file_stem}.csv\")\n", - "filename_parquet = glue(\"{file_stem}.parquet\")\n", - "fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", - "write_parquet(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", - "\n", - "\n", - "# The seasonality per admin unit, irrespective of year ----------------------\n", - "\n", - "seasonality_source_dt <- process_seasonality(\n", - " input_dt=row_seasonality_dt,\n", - " indicator=type_of_seasonality,\n", - " vector_of_durations=possible_month_block_sizes,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col,\n", - " proportion_seasonal_years_threshold=threshold_proportion_seasonal_years\n", - ")\n", - "\n", - "# Compute the duration block; there are normal warnings when it's only 0-es for seasonality:\n", - "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", - "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", - "seasonality_source_dt <- seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" - ] - }, - { - "cell_type": "markdown", - "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + 
"tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", + "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", + "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" + ] }, - "tags": [] - }, - "source": [ - "## Result file" - ] - }, - { - "cell_type": "markdown", - "id": "477fb459-0f98-4a32-96ab-f10b4395495f", - "metadata": {}, - "source": [ - "### long" - ] - }, - { - "cell_type": "markdown", - "id": "db719aed-6347-48f4-8984-add9f8adec2d", - "metadata": {}, - "source": [ - "This format, until further notice, is not saved." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "86f492f3-5634-4987-a2b8-23014aba5d51", + "metadata": {}, + "source": [ + "## Load data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_long_dt <- melt(\n", - " seasonality_source_dt,\n", - " id.vars = grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE, invert = TRUE), # all cols which don't follow the pattern\n", - " variable.name = 'MONTH_BLOCK_SIZE',\n", - " value.name =seasonality_col\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "623480ee-4310-4ead-a8c8-bf294527c814", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load spatial file from dataset\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "spatial_data <- 
get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", - "\n", - "# add remaining admin unit columns and save the final results\n", - "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# order the columns\n", - "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", - "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load routine data from dataset\n", + "case_data_filename <- paste(COUNTRY_CODE, \"routine.parquet\", sep = \"_\")\n", + "original_dt <- get_latest_dataset_file_in_memory(dhis2_dataset, case_data_filename)\n", + "log_msg(glue(\"File {case_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Keeping for now.\n", - "# # 
filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", - "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", - "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" - ] - }, - { - "cell_type": "markdown", - "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "7b769deb-52e5-471d-9950-ac431dd8cf03", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Columns formatting\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)\n", + "common_cols <- names(admin_data)\n", + "\n", + "seasonality_col <- glue('SEASONALITY', toupper(type_of_seasonality), .sep = \"_\")\n", + "season_duration_col <- glue('SEASONAL_BLOCK_DURATION', toupper(type_of_seasonality), .sep = \"_\")\n", + "season_start_month_col <- glue('SEASONAL_BLOCK_START_MONTH', toupper(type_of_seasonality), .sep = \"_\")\n", + "cases_proportion_col <- 'CASES_PROPORTION'\n", + "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", + "print(final_table_cols)" + ] }, - "tags": [] - }, - "source": [ - "### Transform to wide format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e71259c9-29a6-452f-8949-74adb0e62c1c", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d", + "metadata": {}, + "source": [ + "**Create the containers for the data**" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - 
"seasonality_wide_dt <- compute_min_seasonality_block(\n", - " input_dt=seasonality_source_dt,\n", - " seasonality_column_pattern=check_pattern_seasonality,\n", - " vector_of_possible_month_block_sizes=possible_month_block_sizes,\n", - " # indicator=toupper(type_of_seasonality),\n", - " seasonal_blocksize_colname=season_duration_col,\n", - " valid_value = 1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9284c723-97d4-46f9-8837-7f70dae92a31", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create an empty table if the analysis is stopped for lack of enough data\n", + "seasonality_cols <- c(seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", + "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", - "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", - "if (length(seasonality_pattern_cols) > 0L) {\n", - " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L), .SDcols = seasonality_pattern_cols]\n", - " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]\n", - "} else {\n", - " seasonality_wide_dt[, (seasonality_col) := NA_integer_]\n", - "}\n", - "\n", - "# Compute CASES_PROPORTION: proportion of cases in the seasonal block vs ANNUAL total\n", - "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", - "\n", - "# Step 1: Compute annual totals per admin-year from imputed data\n", - "annual_totals_dt <- imputed_dt[, 
.(ANNUAL_TOTAL = sum(get(imputed_col), na.rm = TRUE)), by = c(admin_id_col, year_col)]\n", - "\n", - "# Step 2: Function to compute proportion = max block sum / annual total\n", - "compute_cases_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) {\n", - " if (is.na(block_duration) || is.infinite(block_duration)) return(NA_real_)\n", - " \n", - " # Column with block sum (N-month forward-looking sum)\n", - " sum_col <- paste('CASES_SUM', block_duration, 'MTH_FW', sep = '_')\n", - " if (!sum_col %in% names(row_data)) return(NA_real_)\n", - " \n", - " admin_row_data <- row_data[get(admin_col) == admin_id]\n", - " admin_annual_data <- annual_data[get(admin_col) == admin_id]\n", - " if (nrow(admin_row_data) == 0 || nrow(admin_annual_data) == 0) return(NA_real_)\n", - " \n", - " # For each year, get max block sum (only if there are non-NA values)\n", - " yearly_max_block <- admin_row_data[\n", - " !is.na(get(sum_col)),\n", - " .(max_block_sum = if (.N > 0L) max(get(sum_col), na.rm = TRUE) else NA_real_),\n", - " by = year_column\n", - " ]\n", - " \n", - " # Remove rows with NA or -Inf (from max when all values were NA)\n", - " yearly_max_block <- yearly_max_block[is.finite(max_block_sum)]\n", - " if (nrow(yearly_max_block) == 0) return(NA_real_)\n", - " \n", - " # Merge with annual totals\n", - " merged <- merge(yearly_max_block, admin_annual_data, by = year_column)\n", - " merged <- merged[ANNUAL_TOTAL > 0]\n", - " if (nrow(merged) == 0) return(NA_real_)\n", - " \n", - " # Proportion = block sum / annual total, then average across years\n", - " merged[, prop := max_block_sum / ANNUAL_TOTAL]\n", - " return(mean(merged$prop, na.rm = TRUE))\n", - "}\n", - "\n", - "seasonality_wide_dt[, (cases_proportion_col) := mapply(\n", - " compute_cases_proportion,\n", - " admin_id = get(admin_id_col),\n", - " block_duration = get(season_duration_col),\n", - " MoreArgs = list(row_data = row_seasonality_dt, annual_data = annual_totals_dt, 
admin_col = admin_id_col, year_column = year_col)\n", - ")]\n", - "\n", - "# Set CASES_PROPORTION to NA for non-seasonal admin units\n", - "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (cases_proportion_col) := NA_real_]\n", - "\n", - "# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block\n", - "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", - "\n", - "# Function to find the most frequent starting month for a given admin unit and block duration\n", - "compute_start_month <- function(admin_id, block_duration, row_data, admin_col, year_column, month_column) {\n", - " if (is.na(block_duration) || is.infinite(block_duration)) return(NA_integer_)\n", - " \n", - " # Column with row-level seasonality indicator for this block duration\n", - " seasonality_row_col <- paste('CASES', block_duration, 'MTH_ROW_SEASONALITY', sep = '_')\n", - " if (!seasonality_row_col %in% names(row_data)) return(NA_integer_)\n", - " \n", - " admin_row_data <- row_data[get(admin_col) == admin_id]\n", - " if (nrow(admin_row_data) == 0) return(NA_integer_)\n", - " \n", - " # Filter rows where seasonality = 1 (this month is the start of a seasonal block)\n", - " seasonal_months <- admin_row_data[get(seasonality_row_col) == 1, get(month_column)]\n", - " \n", - " if (length(seasonal_months) == 0) return(NA_integer_)\n", - " \n", - " # Find the most frequent month (mode)\n", - " month_counts <- table(seasonal_months)\n", - " most_frequent_month <- as.integer(names(month_counts)[which.max(month_counts)])\n", - " \n", - " return(most_frequent_month)\n", - "}\n", - "\n", - "seasonality_wide_dt[, (season_start_month_col) := mapply(\n", - " compute_start_month,\n", - " admin_id = get(admin_id_col),\n", - " block_duration = get(season_duration_col),\n", - " MoreArgs = list(row_data = row_seasonality_dt, admin_col = admin_id_col, year_column = year_col, month_column = month_col)\n", - ")]\n", - "\n", - "# Set SEASONAL_BLOCK_START_MONTH to NA for 
non-seasonal admin units\n", - "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "b8da71be-45f1-405c-857c-ed86984988f4", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Preprocess input data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# add remaining admin unit columns and save the final results\n", - "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", - "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", - "# head(admin_seasonality_wide_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "a2f1a373-4b34-42db-b591-b25c7050dee6", - "metadata": {}, - "source": [ - "**Save output**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# format table\n", + "setDT(original_dt)\n", + "integer_cols <- c(year_col, month_col)\n", + "numeric_cols <- c(original_values_col)\n", + "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", + "# head(original_dt)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create the filename\n", - "file_stem <- paste(COUNTRY_CODE, 
type_of_seasonality, 'seasonality', sep = '_')\n", - "filename_csv = glue(\"{file_stem}.csv\")\n", - "filename_parquet = glue(\"{file_stem}.parquet\")\n", - "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", - "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", - "log_msg(paste0(\"Case seasonality results saved in folder \", OUTPUT_DATA_PATH))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54", - "metadata": { - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "execution_count": null, + "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# keep only the useful columns and aggregate the data on them\n", + "original_dt <- original_dt[,\n", + " setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), \n", + " by = c(admin_id_col, period_cols)\n", + " ]\n", + "\n", + "num_periods <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[1]]\n", + "all_rows <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[2]]\n", + "\n", + "if (num_periods < minimum_periods){ \n", + " log_msg(glue(\"Data is not reliable: \n", + " at least {minimum_periods} year-month periods of data are required for the case analyais; \n", + " the data only contains {num_periods} periods. 
Abandoning analysis.\")\n", + " , level=\"error\")\n", + " stop(\"ERROR 1\")\n", + "}\n", + "\n", + "# inject the (possibly missing) rows into the data\n", + "original_dt <- make_full_time_space_data(\n", + " input_dt=original_dt,\n", + " full_rows_dt=all_rows,\n", + " target_colname=original_values_col,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col)\n", + "\n", + "if(nrow(original_dt[is.na(get(original_values_col)),]) > (maximum_proportion_missings_overall * nrow(original_dt))){ \n", + " log_msg(\"There are too many missing values in the data overall. Abandoning analysis.\", level=\"error\")\n", + " stop(\"ERROR 2\") \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54", + "metadata": {}, + "source": [ + "### Imputation of missings" + ] + }, + { + "cell_type": "markdown", + "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12", + "metadata": {}, + "source": [ + "**Remove impute files (if any)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Remove existing imputation files\n", + "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", + "files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", + "file.remove(files_to_remove)\n", + "print(glue(\"Deleted files: {str(files_to_remove)}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# create the name of the column which will store the imputed/estimated values\n", + "imputed_col 
= paste(original_values_col, 'EST', sep = '_')\n", + "\n", + "# if there are rows of missing data for cases, impute them (SARIMA)\n", + "if(nrow(original_dt[!is.na(get(original_values_col)),]) != nrow(original_dt)) {\n", + " log_msg(\"There is missing data. Proceeding to impute them.\", level=\"warning\")\n", + " \n", + " # extract data on only the administrative units which have missing values for original_values_col\n", + " missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)\n", + " missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = YEAR, month = MONTH)]\n", + " missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]\n", + " \n", + " # how many rows missing for each administrative unit? if too many, then not good idea to impute\n", + " missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]\n", + " \n", + " # if for any given admin unit, more than a given % of data is missing, there's too much to impute (maybe should be stricter - to discuss)\n", + " if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){\n", + " log_msg(\"Some administrative units have too many missing values in the target data. 
Abandoning analysis.\", level=\"error\")\n", + " stop(\"ERROR 3\")\n", + " }\n", + " \n", + " # split to list per admin_unit_id, to apply SARIMA imputation on each time series (per admin unit)\n", + " missing_districts_list <- split(missing_dt, by = admin_id_col)\n", + " \n", + " # seasonal ARIMA to estimate missing cases: apply function to list of data.tables with missing rows, then create data.table from result\n", + " filled_missings_dt <- rbindlist(\n", + " lapply(missing_districts_list,\n", + " fill_missing_cases_ts,\n", + " original_values_colname=original_values_col,\n", + " estimated_values_colname=imputed_col,\n", + " admin_colname=admin_id_col,\n", + " period_colname='PERIOD',\n", + " threshold_for_missing = 0.0)\n", + " )\n", + " \n", + " # add the imputed (\"_EST\") values to the original data\n", + " imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)\n", + " \n", + " # copy from the districts without missings;\n", + " # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt\n", + " imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]\n", + "\n", + " # Save imputed file, only if it was computed (if there is missing data to impute)\n", + " safe_create_dir(INTERMEDIATE_RESULTS_PATH)\n", + " fwrite(imputed_dt, file = file.path(INTERMEDIATE_RESULTS_PATH, filename_imputed_dt))\n", + " \n", + "} else {\n", + " imputed_dt <- copy(original_dt)\n", + " imputed_dt[, (imputed_col) := get(original_values_col)]\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9db44942-d844-491c-9045-906e99a37c60", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Seasonality" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b", + "metadata": { + 
"editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Compute month-level seasonality indicators\n", + "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", + "\n", + "row_seasonality_dt <- compute_month_seasonality(\n", + " input_dt=imputed_dt,\n", + " indicator=type_of_seasonality,\n", + " values_colname=imputed_col,\n", + " vector_of_durations=possible_month_block_sizes,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col,\n", + " proportion_threshold=threshold_for_seasonality,\n", + " use_calendar_year_denominator=use_calendar_year_denominator\n", + ")\n", + "\n", + "# Create the filename\n", + "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'row_seasonality', sep = '_')\n", + "filename_csv = glue(\"{file_stem}.csv\")\n", + "filename_parquet = glue(\"{file_stem}.parquet\")\n", + "fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", + "write_parquet(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", + "\n", + "\n", + "# The seasonality per admin unit, irrespective of year ----------------------\n", + "\n", + "seasonality_source_dt <- process_seasonality(\n", + " input_dt=row_seasonality_dt,\n", + " indicator=type_of_seasonality,\n", + " vector_of_durations=possible_month_block_sizes,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col,\n", + " proportion_seasonal_years_threshold=threshold_proportion_seasonal_years\n", + ")\n", + "\n", + "# Compute the duration block; there are normal warnings when it's only 0-es for seasonality:\n", + "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", + "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", + "seasonality_source_dt <- 
seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" + ] + }, + { + "cell_type": "markdown", + "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Result file" + ] + }, + { + "cell_type": "markdown", + "id": "477fb459-0f98-4a32-96ab-f10b4395495f", + "metadata": {}, + "source": [ + "### long" + ] + }, + { + "cell_type": "markdown", + "id": "db719aed-6347-48f4-8984-add9f8adec2d", + "metadata": {}, + "source": [ + "This format, until further notice, is not saved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "seasonality_long_dt <- melt(\n", + " seasonality_source_dt,\n", + " id.vars = grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE, invert = TRUE), # all cols which don't follow the pattern\n", + " variable.name = 'MONTH_BLOCK_SIZE',\n", + " value.name =seasonality_col\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", + "\n", + "# add remaining admin unit columns and save the final results\n", + "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3", + 
"metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# order the columns\n", + "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", + "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Keeping for now.\n", + "# # filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", + "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", + "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" + ] + }, + { + "cell_type": "markdown", + "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Transform to wide format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e71259c9-29a6-452f-8949-74adb0e62c1c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "seasonality_wide_dt <- compute_min_seasonality_block(\n", + " input_dt=seasonality_source_dt,\n", + " seasonality_column_pattern=check_pattern_seasonality,\n", + " vector_of_possible_month_block_sizes=possible_month_block_sizes,\n", + " # indicator=toupper(type_of_seasonality),\n", + " seasonal_blocksize_colname=season_duration_col,\n", + " valid_value = 1\n", + ")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "9284c723-97d4-46f9-8837-7f70dae92a31", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", + "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", + "if (length(seasonality_pattern_cols) > 0L) {\n", + " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L), .SDcols = seasonality_pattern_cols]\n", + " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]\n", + "} else {\n", + " seasonality_wide_dt[, (seasonality_col) := NA_integer_]\n", + "}\n", + "\n", + "# Compute CASES_PROPORTION: proportion of cases in the seasonal block vs ANNUAL total\n", + "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", + "annual_totals_dt <- imputed_dt[, .(ANNUAL_TOTAL = sum(get(imputed_col), na.rm = TRUE)), by = c(admin_id_col, year_col)]\n", + "\n", + "seasonality_wide_dt[, (cases_proportion_col) := mapply(\n", + " compute_cases_proportion,\n", + " admin_id = get(admin_id_col),\n", + " block_duration = get(season_duration_col),\n", + " MoreArgs = list(\n", + " row_data = row_seasonality_dt,\n", + " annual_data = annual_totals_dt,\n", + " admin_col = admin_id_col,\n", + " year_column = year_col\n", + " )\n", + ")]\n", + "\n", + "# Set CASES_PROPORTION to NA for non-seasonal admin units\n", + "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (cases_proportion_col) := NA_real_]\n", + "\n", + "# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block\n", + "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", + "seasonality_wide_dt[, (season_start_month_col) := mapply(\n", + " compute_start_month,\n", + " admin_id = 
get(admin_id_col),\n", + " block_duration = get(season_duration_col),\n", + " MoreArgs = list(\n", + " row_data = row_seasonality_dt,\n", + " admin_col = admin_id_col,\n", + " month_column = month_col\n", + " )\n", + ")]\n", + "\n", + "# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units\n", + "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# add remaining admin unit columns and save the final results\n", + "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", + "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", + "# head(admin_seasonality_wide_dt)" + ] + }, + { + "cell_type": "markdown", + "id": "a2f1a373-4b34-42db-b591-b25c7050dee6", + "metadata": {}, + "source": [ + "**Save output**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create the filename\n", + "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')\n", + "filename_csv = glue(\"{file_stem}.csv\")\n", + "filename_parquet = glue(\"{file_stem}.parquet\")\n", + "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", + "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", + "log_msg(paste0(\"Case seasonality results saved in folder \", OUTPUT_DATA_PATH))" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6becf7c6-a821-4ed0-a91f-3b8dc2da6313", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } - }, - "outputs": [], - "source": [ - "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6becf7c6-a821-4ed0-a91f-3b8dc2da6313", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r new file mode 100644 index 0000000..1c4e8bf --- /dev/null +++ b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r @@ -0,0 +1,61 @@ 
#' Mean share of the annual total captured by the peak seasonal block
#'
#' For one administrative unit, takes the largest block sum found in column
#' `CASES_SUM_<block_duration>_MTH_FW` for each year, divides it by that
#' year's `ANNUAL_TOTAL`, and averages the resulting ratios across years.
#'
#' Columns are read with `[[`, so `row_data` and `annual_data` may be plain
#' data.frames or data.tables, and a column that happens to be called
#' `admin_id` or `admin_col` cannot shadow the function arguments.
#'
#' @param admin_id Identifier of the administrative unit to evaluate.
#' @param block_duration Block length used to build the block-sum column
#'   name. `NA` or infinite values yield `NA_real_`.
#' @param row_data Table holding `admin_col`, `year_column` and the
#'   block-sum column.
#' @param annual_data Table holding `admin_col`, `year_column` and
#'   `ANNUAL_TOTAL`. Assumed to have one row per unit and year; if a year is
#'   repeated, the first matching row is used.
#' @param admin_col Name of the administrative-unit id column.
#' @param year_column Name of the year column.
#' @return A numeric scalar, or `NA_real_` when the block-sum column is
#'   absent, the unit has no rows, no block sum is observed, or no year has a
#'   strictly positive `ANNUAL_TOTAL`.
compute_cases_proportion <- function(admin_id,
                                     block_duration,
                                     row_data,
                                     annual_data,
                                     admin_col,
                                     year_column) {
  if (is.na(block_duration) || is.infinite(block_duration)) {
    return(NA_real_)
  }

  sum_col <- paste("CASES_SUM", block_duration, "MTH_FW", sep = "_")
  if (!sum_col %in% names(row_data)) {
    return(NA_real_)
  }

  # which() discards NA comparisons, so rows with a missing id never match.
  unit_rows <- which(row_data[[admin_col]] == admin_id)
  unit_annual_rows <- which(annual_data[[admin_col]] == admin_id)
  if (length(unit_rows) == 0L || length(unit_annual_rows) == 0L) {
    return(NA_real_)
  }

  block_sums <- row_data[[sum_col]][unit_rows]
  block_years <- row_data[[year_column]][unit_rows]

  # Rows without a block sum, or without a year to attach it to, cannot
  # contribute to a per-year maximum.
  observed <- !is.na(block_sums) & !is.na(block_years)
  if (!any(observed)) {
    return(NA_real_)
  }

  # Largest block sum per year; names carry the year as character.
  yearly_max <- tapply(block_sums[observed], block_years[observed], max)
  yearly_max <- stats::setNames(as.numeric(yearly_max), names(yearly_max))
  yearly_max <- yearly_max[is.finite(yearly_max)]
  if (length(yearly_max) == 0L) {
    return(NA_real_)
  }

  # Inner join on year: years without an annual total get NA and are dropped.
  annual_years <- annual_data[[year_column]][unit_annual_rows]
  annual_totals <- annual_data[["ANNUAL_TOTAL"]][unit_annual_rows]
  matched_totals <- annual_totals[
    match(names(yearly_max), as.character(annual_years))
  ]

  # A zero or missing denominator would give an undefined proportion.
  usable <- !is.na(matched_totals) & matched_totals > 0
  if (!any(usable)) {
    return(NA_real_)
  }

  mean(yearly_max[usable] / matched_totals[usable], na.rm = TRUE)
}


#' Most frequent start month of the seasonal block for one admin unit
#'
#' Looks at the rows of `row_data` belonging to `admin_id` whose column
#' `CASES_<block_duration>_MTH_ROW_SEASONALITY` equals 1 and returns the
#' month that occurs most often among them. Ties are resolved by
#' `which.max()`, which keeps the first maximum in `table()` order.
#'
#' @param admin_id Identifier of the administrative unit to evaluate.
#' @param block_duration Block length used to build the seasonality-flag
#'   column name. `NA` or infinite values yield `NA_integer_`.
#' @param row_data Table (data.frame or data.table) holding `admin_col`,
#'   `month_column` and the seasonality-flag column.
#' @param admin_col Name of the administrative-unit id column.
#' @param month_column Name of the month column.
#' @return An integer scalar, or `NA_integer_` when the flag column is
#'   absent, the unit has no rows, or no flagged row has a non-missing month.
compute_start_month <- function(admin_id,
                                block_duration,
                                row_data,
                                admin_col,
                                month_column) {
  if (is.na(block_duration) || is.infinite(block_duration)) {
    return(NA_integer_)
  }

  seasonality_row_col <- paste(
    "CASES", block_duration, "MTH_ROW_SEASONALITY",
    sep = "_"
  )
  if (!seasonality_row_col %in% names(row_data)) {
    return(NA_integer_)
  }

  is_unit <- row_data[[admin_col]] == admin_id
  if (!any(is_unit, na.rm = TRUE)) {
    return(NA_integer_)
  }

  # which() treats NA flags / ids as "not selected".
  start_rows <- which(is_unit & row_data[[seasonality_row_col]] == 1)
  seasonal_months <- row_data[[month_column]][start_rows]

  # table() silently drops NA; without this filter an all-NA input would
  # produce a zero-length result instead of a scalar NA.
  seasonal_months <- seasonal_months[!is.na(seasonal_months)]
  if (length(seasonal_months) == 0L) {
    return(NA_integer_)
  }

  month_counts <- table(seasonal_months)
  as.integer(names(month_counts)[which.max(month_counts)])
}

# Patch residue that shared this source line (header of the next diff), kept verbatim:
# diff --git a/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb b/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb index
4b384b0..279cbc6 100644 --- a/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb +++ b/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb @@ -1,1109 +1,1056 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "5eebc540-e973-497e-8427-e73d546fdd09", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Rainfall Seasonality Pipeline\n", + "\n", + "This pipeline classifies administrative units (`ADM2`) as **seasonal** or **non-seasonal** based on rainfall patterns, determines the **duration** of their rainy season, and identifies the respective **onset month**.\n", + "\n", + "### Methodology\n", + "\n", + "The pipeline employs a four-step hierarchical method:\n", + "\n", + "#### 1. Identify \"start\" month\n", + "Evaluates every specific month in the dataset (per district and year) to determine if it marks the beginning of a concentrated rainfall period.\n", + "- **Logic**: For each month, compute a forward-looking proportion: the ratio of rainfall in the next n-month block (e.g., 3, 4, or 5 months) to the annual total.\n", + "- **Denominator**: Can use either a 12-month forward-looking sliding window (WHO approach) or calendar year (Jan-Dec).\n", + "- **Threshold**: If this proportion exceeds `threshold_for_seasonality` (e.g., 60%), the month is flagged as a valid \"start\" month.\n", + "\n", + "#### 2. Classify ADM2 as \"Seasonal\" or \"Non-seasonal\"\n", + "Aggregates month-level flags to determine if the district consistently experiences a rainy season.\n", + "- **Logic**: Calculate the proportion of years that a specific month was flagged as a \"start\" in Step 1.\n", + "- **Consistency**: If this proportion exceeds `threshold_proportion_seasonal_years` (e.g., 70%), the district is classified as \"Seasonal\".\n", + "\n", + "#### 3. 
Determine season duration\n", + "Resolves cases where a district qualifies for multiple block durations by selecting the minimum duration.\n", + "- **Output**: `SEASONAL_BLOCK_DURATION_RAINFALL` - the shortest window with the required rainfall proportion.\n", + "\n", + "#### 4. Determine season onset\n", + "Identifies the single official start month using the mode (most frequent value) across years.\n", + "- **Output**: `SEASONAL_BLOCK_START_MONTH_RAINFALL`\n", + "\n", + "---\n", + "\n", + "## Preliminaries" + ], + "id": "5eebc540-e973-497e-8427-e73d546fdd09" }, - "tags": [] - }, - "source": [ - "## Rainfall Seasonality Pipeline\n", - "\n", - "This pipeline classifies administrative units (`ADM2`) as **seasonal** or **non-seasonal** based on rainfall patterns, determines the **duration** of their rainy season, and identifies the respective **onset month**.\n", - "\n", - "### Methodology\n", - "\n", - "The pipeline employs a four-step hierarchical method:\n", - "\n", - "#### 1. Identify \"start\" month\n", - "Evaluates every specific month in the dataset (per district and year) to determine if it marks the beginning of a concentrated rainfall period.\n", - "- **Logic**: For each month, compute a forward-looking proportion: the ratio of rainfall in the next n-month block (e.g., 3, 4, or 5 months) to the annual total.\n", - "- **Denominator**: Can use either a 12-month forward-looking sliding window (WHO approach) or calendar year (Jan-Dec).\n", - "- **Threshold**: If this proportion exceeds `threshold_for_seasonality` (e.g., 60%), the month is flagged as a valid \"start\" month.\n", - "\n", - "#### 2. 
Classify ADM2 as \"Seasonal\" or \"Non-seasonal\"\n", - "Aggregates month-level flags to determine if the district consistently experiences a rainy season.\n", - "- **Logic**: Calculate the proportion of years that a specific month was flagged as a \"start\" in Step 1.\n", - "- **Consistency**: If this proportion exceeds `threshold_proportion_seasonal_years` (e.g., 70%), the district is classified as \"Seasonal\".\n", - "\n", - "#### 3. Determine season duration\n", - "Resolves cases where a district qualifies for multiple block durations by selecting the minimum duration.\n", - "- **Output**: `SEASONAL_BLOCK_DURATION_RAINFALL` - the shortest window with the required rainfall proportion.\n", - "\n", - "#### 4. Determine season onset\n", - "Identifies the single official start month using the mode (most frequent value) across years.\n", - "- **Output**: `SEASONAL_BLOCK_START_MONTH_RAINFALL`\n", - "\n", - "---\n", - "\n", - "## Preliminaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6b4eaed", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Clear environment\n", - "rm(list=ls())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" + ], + "execution_count": null, + "outputs": [], + "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 
Global settings\n", - "options(scipen=999)\n", - "\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6915379-108e-4405-b553-b074aad447d6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Clear environment\n", + "rm(list=ls())" + ], + "execution_count": null, + "outputs": [], + "id": "b6b4eaed" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_rainfall')\n", - "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "180b93e7-61af-4981-863f-593b755968bd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# List required pcks\n", - "required_packages <- c(\n", - " \"jsonlite\",\n", - " \"data.table\",\n", - " \"ggplot2\",\n", - " \"fpp3\",\n", - " \"arrow\",\n", - " \"glue\",\n", - " \"sf\",\n", - " \"RColorBrewer\",\n", - " \"httr\",\n", - " \"reticulate\"\n", - ")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a", - "metadata": { - "editable": true, - "slideshow": 
{ - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Global settings\n", + "options(scipen=999)\n", + "\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ], + "execution_count": null, + "outputs": [], + "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Check that compute_month_seasonality() supports the required parameter\n", - "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", - " error_msg <- paste0(\n", - " \"Error: The function compute_month_seasonality() does not support the parameter 'use_calendar_year_denominator'. 
\",\n", - " \"Please ensure that the snt_utils.r file is updated to the latest version.\"\n", - " )\n", - " log_msg(error_msg, level = \"error\")\n", - " stop(error_msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_seasonality_rainfall')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_rainfall')\n", + "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" + ], + "execution_count": null, + "outputs": [], + "id": "a6915379-108e-4405-b553-b074aad447d6" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "era5_dataset <- config_json$SNT_DATASET_IDENTIFIERS$ERA5_DATASET_CLIMATE\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": 
"804a1bd1-26c8-4f6a-af35-3eba64fe0741", - "metadata": {}, - "source": [ - "## Globals and parameters" - ] - }, - { - "cell_type": "markdown", - "id": "414f9ee0-5264-43c4-992f-cff6c719d65c", - "metadata": {}, - "source": [ - "**Parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb82560d-c123-4c54-bfa1-fb5f05e4ad69", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "minimum_periods <- as.integer(48)\n", - "maximum_proportion_missings_overall <- 0.1\n", - "maximum_proportion_missings_per_district <- 0.2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_seasonality_rainfall.r\"))" + ], + "execution_count": null, + "outputs": [], + "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4" }, - "tags": [ - "parameters" - ], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fallback parameter values for local/dev execution\n", - "# When run via pipeline, these are injected by Papermill in the first cell\n", - "if (!exists(\"minimum_month_block_size\")) {\n", - " minimum_month_block_size <- as.integer(3)\n", - "}\n", - "if (!exists(\"maximum_month_block_size\")) {\n", - " maximum_month_block_size <- as.integer(5)\n", - "}\n", - "if (!exists(\"threshold_for_seasonality\")) {\n", - " threshold_for_seasonality <- 0.6\n", - "}\n", - "if (!exists(\"threshold_proportion_seasonal_years\")) {\n", - " threshold_proportion_seasonal_years <- 0.5\n", - "}\n", - "if (!exists(\"use_calendar_year_denominator\")) {\n", - " use_calendar_year_denominator <- FALSE\n", - "}\n", - "\n", - "# Ensure correct types\n", - "minimum_month_block_size <- 
as.integer(minimum_month_block_size)\n", - "maximum_month_block_size <- as.integer(maximum_month_block_size)\n", - "\n", - "# Log parameter values\n", - "log_msg(paste(\"Minimum month block size:\", minimum_month_block_size))\n", - "log_msg(paste(\"Maximum month block size:\", maximum_month_block_size))\n", - "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", - "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", - "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" - ] - }, - { - "cell_type": "markdown", - "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630", - "metadata": {}, - "source": [ - "**Fixed routine formatting columns**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# List required pcks\n", + "required_packages <- c(\n", + " \"jsonlite\",\n", + " \"data.table\",\n", + " \"ggplot2\",\n", + " \"fpp3\",\n", + " \"arrow\",\n", + " \"glue\",\n", + " \"sf\",\n", + " \"RColorBrewer\",\n", + " \"httr\",\n", + " \"reticulate\"\n", + ")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ], + "execution_count": null, + "outputs": [], + "id": "180b93e7-61af-4981-863f-593b755968bd" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global variables\n", - "type_of_seasonality <- \"rainfall\"\n", - "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", - "data_source <- 'ERA5'\n", - "original_values_col <- 'MEAN'\n", - "\n", - "# Create suffix for output filenames based on denominator method\n", - "# This allows comparing outputs from different methods\n", - "if (use_calendar_year_denominator) {\n", - " SUFFIX <- 
\"_calendar\"\n", - " SUFFIX_TEXT <- \"calendar\"\n", - "} else {\n", - " SUFFIX <- \"_sliding\"\n", - " SUFFIX_TEXT <- \"sliding\"\n", - "}\n", - "\n", - "# Space and time columns\n", - "admin_level <- 'ADM2'\n", - "admin_id_col <- paste(admin_level, toupper('id'), sep = '_')\n", - "admin_name_col <- paste(admin_level, toupper('name'), sep = '_')\n", - "year_col <- 'YEAR'\n", - "month_col <- 'MONTH'\n", - "period_cols <- c(year_col, month_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Check that compute_month_seasonality() supports the required parameter\n", + "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", + " error_msg <- paste0(\n", + " \"Error: The function compute_month_seasonality() does not support the parameter 'use_calendar_year_denominator'. 
\",\n", + " \"Please ensure that the snt_utils.r file is updated to the latest version.\"\n", + " )\n", + " log_msg(error_msg, level = \"error\")\n", + " stop(error_msg)\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", - "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", - "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" - ] - }, - { - "cell_type": "markdown", - "id": "86f492f3-5634-4987-a2b8-23014aba5d51", - "metadata": {}, - "source": [ - "## Load data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "623480ee-4310-4ead-a8c8-bf294527c814", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + 
"\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "era5_dataset <- config_json$SNT_DATASET_IDENTIFIERS$ERA5_DATASET_CLIMATE\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ], + "execution_count": null, + "outputs": [], + "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load rainfall data from dataset\n", - "rainfall_data_filename <- paste(COUNTRY_CODE, \"total_precipitation_monthly.parquet\", sep = \"_\")\n", - "original_dt <- get_latest_dataset_file_in_memory(era5_dataset, rainfall_data_filename)\n", - "log_msg(glue(\"File {rainfall_data_filename} successfully loaded from dataset version: {era5_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b769deb-52e5-471d-9950-ac431dd8cf03", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Columns formatting\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)\n", - "common_cols <- names(admin_data)\n", - "\n", - "seasonality_col <- glue('SEASONALITY', toupper(type_of_seasonality), .sep = \"_\")\n", - "season_duration_col <- glue('SEASONAL_BLOCK_DURATION', toupper(type_of_seasonality), .sep = \"_\")\n", - "season_start_month_col <- glue('SEASONAL_BLOCK_START_MONTH', toupper(type_of_seasonality), .sep = \"_\")\n", - "rain_proportion_col <- 'RAIN_PROPORTION'\n", - "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, rain_proportion_col)\n", - "print(final_table_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d", - "metadata": {}, - "source": [ - "**Create the 
containers for the data**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create an empty table if the analysis is stopped for lack of enough data\n", - "seasonality_cols <- c(seasonality_col, season_duration_col, season_start_month_col, rain_proportion_col)\n", - "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" - ] - }, - { - "cell_type": "markdown", - "id": "b8da71be-45f1-405c-857c-ed86984988f4", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Globals and parameters" + ], + "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741" }, - "tags": [] - }, - "source": [ - "## Preprocess input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parameters**" + ], + "id": "414f9ee0-5264-43c4-992f-cff6c719d65c" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# format table\n", - "setDT(original_dt)\n", - "integer_cols <- c(year_col, month_col)\n", - "numeric_cols <- c(original_values_col)\n", - "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", - "# head(original_dt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "minimum_periods <- as.integer(48)\n", + "maximum_proportion_missings_overall <- 0.1\n", + "maximum_proportion_missings_per_district <- 0.2" + ], + "execution_count": null, + "outputs": [], + "id": 
"fb82560d-c123-4c54-bfa1-fb5f05e4ad69" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# keep only the useful columns and aggregate the data on them\n", - "original_dt <- original_dt[,\n", - " setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), \n", - " by = c(admin_id_col, period_cols)\n", - " ]\n", - "\n", - "num_periods <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[1]]\n", - "all_rows <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[2]]\n", - "\n", - "if (num_periods < minimum_periods){ \n", - " log_msg(glue(\"Data is not reliable: \n", - " at least {minimum_periods} year-month periods of data are required for the case analyais; \n", - " the data only contains {num_periods} periods. Abandoning analysis.\")\n", - " , level=\"error\")\n", - " stop(\"ERROR 1\")\n", - "}\n", - "\n", - "# inject the (possibly missing) rows into the data\n", - "original_dt <- make_full_time_space_data(\n", - " input_dt=original_dt,\n", - " full_rows_dt=all_rows,\n", - " target_colname=original_values_col,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col)\n", - "\n", - "if(nrow(original_dt[is.na(get(original_values_col)),]) > (maximum_proportion_missings_overall * nrow(original_dt))){ \n", - " log_msg(\"There are too many missing values in the data overall. 
Abandoning analysis.\", level=\"error\")\n", - " stop(\"ERROR 2\") \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54", - "metadata": {}, - "source": [ - "### Imputation of missings" - ] - }, - { - "cell_type": "markdown", - "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12", - "metadata": {}, - "source": [ - "**Remove impute files (if any)**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Remove existing imputation files\n", - "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", - "files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", - "file.remove(files_to_remove)\n", - "print(glue(\"Deleted files: {str(files_to_remove)}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Fallback parameter values for local/dev execution\n", + "# When run via pipeline, these are injected by Papermill in the first cell\n", + "if (!exists(\"minimum_month_block_size\")) {\n", + " minimum_month_block_size <- as.integer(3)\n", + "}\n", + "if (!exists(\"maximum_month_block_size\")) {\n", + " maximum_month_block_size <- as.integer(5)\n", + "}\n", + "if (!exists(\"threshold_for_seasonality\")) {\n", + " threshold_for_seasonality <- 0.6\n", + "}\n", + "if (!exists(\"threshold_proportion_seasonal_years\")) {\n", + " threshold_proportion_seasonal_years <- 0.5\n", + "}\n", + "if 
(!exists(\"use_calendar_year_denominator\")) {\n", + " use_calendar_year_denominator <- FALSE\n", + "}\n", + "\n", + "# Ensure correct types\n", + "minimum_month_block_size <- as.integer(minimum_month_block_size)\n", + "maximum_month_block_size <- as.integer(maximum_month_block_size)\n", + "\n", + "# Log parameter values\n", + "log_msg(paste(\"Minimum month block size:\", minimum_month_block_size))\n", + "log_msg(paste(\"Maximum month block size:\", maximum_month_block_size))\n", + "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", + "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", + "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" + ], + "execution_count": null, + "outputs": [], + "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# create the name of the column which will store the imputed/estimated values\n", - "imputed_col = paste(original_values_col, 'EST', sep = '_')\n", - "\n", - "# if there are rows of missing data for cases, impute them (SARIMA)\n", - "if(nrow(original_dt[!is.na(get(original_values_col)),]) != nrow(original_dt)) {\n", - " log_msg(\"There is missing data. Proceeding to impute them.\", level=\"warning\")\n", - " \n", - " # extract data on only the administrative units which have missing values for original_values_col\n", - " missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)\n", - " missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = YEAR, month = MONTH)]\n", - " missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]\n", - " \n", - " # how many rows missing for each administrative unit? 
if too many, then not good idea to impute\n", - " missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]\n", - " \n", - " # if for any given admin unit, more than a given % of data is missing, there's too much to impute (maybe should be stricter - to discuss)\n", - " if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){\n", - " log_msg(\"Some administrative units have too many missing values in the target data. Abandoning analysis.\", level=\"error\")\n", - " stop(\"ERROR 3\")\n", - " }\n", - " \n", - " # split to list per admin_unit_id, to apply SARIMA imputation on each time series (per admin unit)\n", - " missing_districts_list <- split(missing_dt, by = admin_id_col)\n", - " \n", - " # seasonal ARIMA to estimate missing cases: apply function to list of data.tables with missing rows, then create data.table from result\n", - " filled_missings_dt <- rbindlist(\n", - " lapply(missing_districts_list,\n", - " fill_missing_cases_ts,\n", - " original_values_colname=original_values_col,\n", - " estimated_values_colname=imputed_col,\n", - " admin_colname=admin_id_col,\n", - " period_colname='PERIOD',\n", - " threshold_for_missing = 0.0)\n", - " )\n", - " \n", - " # add the imputed (\"_EST\") values to the original data\n", - " imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)\n", - " \n", - " # copy from the districts without missings;\n", - " # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt\n", - " imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]\n", - "\n", - " # Save imputed file, only if it was computed (if there is missing data to impute)\n", - " safe_create_dir(INTERMEDIATE_RESULTS_PATH)\n", - " fwrite(imputed_dt, file = 
file.path(INTERMEDIATE_RESULTS_PATH, filename_imputed_dt))\n", - " \n", - "} else {\n", - " imputed_dt <- copy(original_dt)\n", - " imputed_dt[, (imputed_col) := get(original_values_col)]\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9db44942-d844-491c-9045-906e99a37c60", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Fixed routine formatting columns**" + ], + "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630" }, - "tags": [] - }, - "source": [ - "## Seasonality" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Global variables\n", + "type_of_seasonality <- \"rainfall\"\n", + "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", + "data_source <- 'ERA5'\n", + "original_values_col <- 'MEAN'\n", + "\n", + "# Create suffix for output filenames based on denominator method\n", + "# This allows comparing outputs from different methods\n", + "if (use_calendar_year_denominator) {\n", + " SUFFIX <- \"_calendar\"\n", + " SUFFIX_TEXT <- \"calendar\"\n", + "} else {\n", + " SUFFIX <- \"_sliding\"\n", + " SUFFIX_TEXT <- \"sliding\"\n", + "}\n", + "\n", + "# Space and time columns\n", + "admin_level <- 'ADM2'\n", + "admin_id_col <- paste(admin_level, toupper('id'), sep = '_')\n", + "admin_name_col <- paste(admin_level, toupper('name'), sep = '_')\n", + "year_col <- 'YEAR'\n", + "month_col <- 'MONTH'\n", + "period_cols <- c(year_col, month_col)" + ], + "execution_count": null, + "outputs": [], + "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": 
[ - "# Step 1: Compute month-level seasonality indicators\n", - "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", - "\n", - "row_seasonality_dt <- compute_month_seasonality(\n", - " input_dt=imputed_dt,\n", - " indicator=type_of_seasonality,\n", - " values_colname=imputed_col,\n", - " vector_of_durations=possible_month_block_sizes,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col,\n", - " proportion_threshold=threshold_for_seasonality,\n", - " use_calendar_year_denominator=use_calendar_year_denominator\n", - ")\n", - "\n", - "# Create the filename\n", - "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'row_seasonality', sep = '_')\n", - "filename_csv = glue(\"{file_stem}.csv\")\n", - "filename_parquet = glue(\"{file_stem}.parquet\")\n", - "fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", - "write_parquet(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", - "\n", - "# The seasonality per admin unit, irrespective of year ----------------------\n", - "\n", - "seasonality_source_dt <- process_seasonality(\n", - " input_dt=row_seasonality_dt,\n", - " indicator=type_of_seasonality,\n", - " vector_of_durations=possible_month_block_sizes,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col,\n", - " proportion_seasonal_years_threshold=threshold_proportion_seasonal_years\n", - ")\n", - "\n", - "# Compute the duration block; there are normal warnings when it's only 0-es for seasonality:\n", - "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", - "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", - "seasonality_source_dt <- seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" - ] - }, - { - 
"cell_type": "markdown", - "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", + "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", + "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" + ], + "execution_count": null, + "outputs": [], + "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db" }, - "tags": [] - }, - "source": [ - "## Result file" - ] - }, - { - "cell_type": "markdown", - "id": "477fb459-0f98-4a32-96ab-f10b4395495f", - "metadata": {}, - "source": [ - "### long" - ] - }, - { - "cell_type": "markdown", - "id": "db719aed-6347-48f4-8984-add9f8adec2d", - "metadata": {}, - "source": [ - "This format, until further notice, is not saved." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data" + ], + "id": "86f492f3-5634-4987-a2b8-23014aba5d51" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_long_dt <- melt(\n", - " seasonality_source_dt,\n", - " id.vars = grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE, invert = TRUE), # all cols which don't follow the pattern\n", - " variable.name = 'MONTH_BLOCK_SIZE',\n", - " value.name =seasonality_col\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "623480ee-4310-4ead-a8c8-bf294527c814" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", - "\n", - "# add remaining admin unit columns and save the final results\n", - "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3", - 
"metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# order the columns\n", - "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", - "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load rainfall data from dataset\n", + "rainfall_data_filename <- paste(COUNTRY_CODE, \"total_precipitation_monthly.parquet\", sep = \"_\")\n", + "original_dt <- get_latest_dataset_file_in_memory(era5_dataset, rainfall_data_filename)\n", + "log_msg(glue(\"File {rainfall_data_filename} successfully loaded from dataset version: {era5_dataset}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Keeping for now.\n", - "# # filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", - "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", - "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" - ] - }, - { - "cell_type": "markdown", - "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Columns 
formatting\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)\n", + "common_cols <- names(admin_data)\n", + "\n", + "seasonality_col <- glue('SEASONALITY', toupper(type_of_seasonality), .sep = \"_\")\n", + "season_duration_col <- glue('SEASONAL_BLOCK_DURATION', toupper(type_of_seasonality), .sep = \"_\")\n", + "season_start_month_col <- glue('SEASONAL_BLOCK_START_MONTH', toupper(type_of_seasonality), .sep = \"_\")\n", + "rain_proportion_col <- 'RAIN_PROPORTION'\n", + "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, rain_proportion_col)\n", + "print(final_table_cols)" + ], + "execution_count": null, + "outputs": [], + "id": "7b769deb-52e5-471d-9950-ac431dd8cf03" }, - "tags": [] - }, - "source": [ - "### Transform to wide format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e71259c9-29a6-452f-8949-74adb0e62c1c", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create the containers for the data**" + ], + "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_wide_dt <- compute_min_seasonality_block(\n", - " input_dt=seasonality_source_dt,\n", - " seasonality_column_pattern=check_pattern_seasonality,\n", - " vector_of_possible_month_block_sizes=possible_month_block_sizes,\n", - " seasonal_blocksize_colname=season_duration_col,\n", - " valid_value = 1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9284c723-97d4-46f9-8837-7f70dae92a31", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Create an empty table if the analysis is stopped for lack of enough data\n", + "seasonality_cols <- c(seasonality_col, 
season_duration_col, season_start_month_col, rain_proportion_col)\n", + "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" + ], + "execution_count": null, + "outputs": [], + "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", - "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", - "if (length(seasonality_pattern_cols) > 0L) {\n", - " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L), .SDcols = seasonality_pattern_cols]\n", - " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]\n", - "} else {\n", - " seasonality_wide_dt[, (seasonality_col) := NA_integer_]\n", - "}\n", - "\n", - "# Compute RAIN_PROPORTION: proportion of rainfall in the seasonal block vs ANNUAL total\n", - "# Only for seasonal admin units (SEASONALITY_RAINFALL = 1)\n", - "\n", - "# Step 1: Compute annual totals per admin-year from imputed data\n", - "annual_totals_dt <- imputed_dt[, .(ANNUAL_TOTAL = sum(get(imputed_col), na.rm = TRUE)), by = c(admin_id_col, year_col)]\n", - "\n", - "# Step 2: Function to compute proportion = max block sum / annual total\n", - "compute_rain_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) {\n", - " if (is.na(block_duration) || is.infinite(block_duration)) return(NA_real_)\n", - " \n", - " # Column with block sum (N-month forward-looking sum)\n", - " sum_col <- paste('RAINFALL_SUM', block_duration, 'MTH_FW', sep = '_')\n", - " if (!sum_col %in% names(row_data)) return(NA_real_)\n", - " \n", - " admin_row_data <- row_data[get(admin_col) == admin_id]\n", - " admin_annual_data <- annual_data[get(admin_col) == admin_id]\n", - " if (nrow(admin_row_data) == 0 || 
nrow(admin_annual_data) == 0) return(NA_real_)\n", - " \n", - " # For each year, get max block sum (only if there are non-NA values)\n", - " yearly_max_block <- admin_row_data[\n", - " !is.na(get(sum_col)),\n", - " .(max_block_sum = if (.N > 0L) max(get(sum_col), na.rm = TRUE) else NA_real_),\n", - " by = year_column\n", - " ]\n", - " \n", - " # Remove rows with NA or -Inf (from max when all values were NA)\n", - " yearly_max_block <- yearly_max_block[is.finite(max_block_sum)]\n", - " if (nrow(yearly_max_block) == 0) return(NA_real_)\n", - " \n", - " # Merge with annual totals\n", - " merged <- merge(yearly_max_block, admin_annual_data, by = year_column)\n", - " merged <- merged[ANNUAL_TOTAL > 0]\n", - " if (nrow(merged) == 0) return(NA_real_)\n", - " \n", - " # Proportion = block sum / annual total, then average across years\n", - " merged[, prop := max_block_sum / ANNUAL_TOTAL]\n", - " return(mean(merged$prop, na.rm = TRUE))\n", - "}\n", - "\n", - "seasonality_wide_dt[, (rain_proportion_col) := mapply(\n", - " compute_rain_proportion,\n", - " admin_id = get(admin_id_col),\n", - " block_duration = get(season_duration_col),\n", - " MoreArgs = list(row_data = row_seasonality_dt, annual_data = annual_totals_dt, admin_col = admin_id_col, year_column = year_col)\n", - ")]\n", - "\n", - "# Set RAIN_PROPORTION to NA for non-seasonal admin units\n", - "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (rain_proportion_col) := NA_real_]\n", - "\n", - "# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block\n", - "# Only for seasonal admin units (SEASONALITY_RAINFALL = 1)\n", - "\n", - "# Function to find the most frequent FIRST starting month for a given admin unit and block duration\n", - "# For each year, we find the FIRST month where the seasonal block starts, then take the mode\n", - "compute_start_month <- function(admin_id, block_duration, row_data, admin_col, year_column, month_column) {\n", - " if (is.na(block_duration) || 
is.infinite(block_duration)) return(NA_integer_)\n", - " \n", - " # Column with row-level seasonality indicator for this block duration\n", - " seasonality_row_col <- paste('RAINFALL', block_duration, 'MTH_ROW_SEASONALITY', sep = '_')\n", - " if (!seasonality_row_col %in% names(row_data)) return(NA_integer_)\n", - " \n", - " admin_row_data <- row_data[get(admin_col) == admin_id]\n", - " if (nrow(admin_row_data) == 0) return(NA_integer_)\n", - " \n", - " # For each YEAR, find the FIRST month where seasonality = 1\n", - " # This is the beginning of the seasonal block in that year\n", - " first_seasonal_months_by_year <- admin_row_data[\n", - " get(seasonality_row_col) == 1, \n", - " .(first_month = min(get(month_column))), \n", - " by = year_column\n", - " ]$first_month\n", - " \n", - " if (length(first_seasonal_months_by_year) == 0) return(NA_integer_)\n", - " \n", - " # Find the most frequent FIRST month across years (mode)\n", - " month_counts <- table(first_seasonal_months_by_year)\n", - " most_frequent_month <- as.integer(names(month_counts)[which.max(month_counts)])\n", - " \n", - " return(most_frequent_month)\n", - "}\n", - "\n", - "seasonality_wide_dt[, (season_start_month_col) := mapply(\n", - " compute_start_month,\n", - " admin_id = get(admin_id_col),\n", - " block_duration = get(season_duration_col),\n", - " MoreArgs = list(row_data = row_seasonality_dt, admin_col = admin_id_col, year_column = year_col, month_column = month_col)\n", - ")]\n", - "\n", - "# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units\n", - "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + 
"source": [ + "## Preprocess input data" + ], + "id": "b8da71be-45f1-405c-857c-ed86984988f4" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# add remaining admin unit columns and save the final results\n", - "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", - "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", - "# head(admin_seasonality_wide_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "a2f1a373-4b34-42db-b591-b25c7050dee6", - "metadata": {}, - "source": [ - "**Save output**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# format table\n", + "setDT(original_dt)\n", + "integer_cols <- c(year_col, month_col)\n", + "numeric_cols <- c(original_values_col)\n", + "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", + "# head(original_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create the filename\n", - "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')\n", - "filename_csv = glue(\"{file_stem}.csv\")\n", - "filename_parquet = glue(\"{file_stem}.parquet\")\n", - "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", - "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", - "log_msg(paste0(\"Rainfall seasonality results saved in folder \", OUTPUT_DATA_PATH))" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba", - "metadata": { - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# keep only the useful columns and aggregate the data on them\n", + "original_dt <- original_dt[,\n", + " setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), \n", + " by = c(admin_id_col, period_cols)\n", + " ]\n", + "\n", + "num_periods <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[1]]\n", + "all_rows <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[2]]\n", + "\n", + "if (num_periods < minimum_periods){ \n", + " log_msg(glue(\"Data is not reliable: \n", + " at least {minimum_periods} year-month periods of data are required for the case analyais; \n", + " the data only contains {num_periods} periods. Abandoning analysis.\")\n", + " , level=\"error\")\n", + " stop(\"ERROR 1\")\n", + "}\n", + "\n", + "# inject the (possibly missing) rows into the data\n", + "original_dt <- make_full_time_space_data(\n", + " input_dt=original_dt,\n", + " full_rows_dt=all_rows,\n", + " target_colname=original_values_col,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col)\n", + "\n", + "if(nrow(original_dt[is.na(get(original_values_col)),]) > (maximum_proportion_missings_overall * nrow(original_dt))){ \n", + " log_msg(\"There are too many missing values in the data overall. 
Abandoning analysis.\", level=\"error\")\n", + " stop(\"ERROR 2\") \n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imputation of missings" + ], + "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Remove impute files (if any)**" + ], + "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Remove existing imputation files\n", + "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", + "files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", + "file.remove(files_to_remove)\n", + "print(glue(\"Deleted files: {str(files_to_remove)}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# create the name of the column which will store the imputed/estimated values\n", + "imputed_col = paste(original_values_col, 'EST', sep = '_')\n", + "\n", + "# if there are rows of missing data for cases, impute them (SARIMA)\n", + "if(nrow(original_dt[!is.na(get(original_values_col)),]) != nrow(original_dt)) {\n", + " log_msg(\"There is missing data. 
Proceeding to impute them.\", level=\"warning\")\n", + " \n", + " # extract data on only the administrative units which have missing values for original_values_col\n", + " missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)\n", + " missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = YEAR, month = MONTH)]\n", + " missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]\n", + " \n", + " # how many rows missing for each administrative unit? if too many, then not good idea to impute\n", + " missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]\n", + " \n", + " # if for any given admin unit, more than a given % of data is missing, there's too much to impute (maybe should be stricter - to discuss)\n", + " if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){\n", + " log_msg(\"Some administrative units have too many missing values in the target data. 
Abandoning analysis.\", level=\"error\")\n", + " stop(\"ERROR 3\")\n", + " }\n", + " \n", + " # split to list per admin_unit_id, to apply SARIMA imputation on each time series (per admin unit)\n", + " missing_districts_list <- split(missing_dt, by = admin_id_col)\n", + " \n", + " # seasonal ARIMA to estimate missing cases: apply function to list of data.tables with missing rows, then create data.table from result\n", + " filled_missings_dt <- rbindlist(\n", + " lapply(missing_districts_list,\n", + " fill_missing_cases_ts,\n", + " original_values_colname=original_values_col,\n", + " estimated_values_colname=imputed_col,\n", + " admin_colname=admin_id_col,\n", + " period_colname='PERIOD',\n", + " threshold_for_missing = 0.0)\n", + " )\n", + " \n", + " # add the imputed (\"_EST\") values to the original data\n", + " imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)\n", + " \n", + " # copy from the districts without missings;\n", + " # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt\n", + " imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]\n", + "\n", + " # Save imputed file, only if it was computed (if there is missing data to impute)\n", + " safe_create_dir(INTERMEDIATE_RESULTS_PATH)\n", + " fwrite(imputed_dt, file = file.path(INTERMEDIATE_RESULTS_PATH, filename_imputed_dt))\n", + " \n", + "} else {\n", + " imputed_dt <- copy(original_dt)\n", + " imputed_dt[, (imputed_col) := get(original_values_col)]\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6" + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Seasonality" + ], + "id": "9db44942-d844-491c-9045-906e99a37c60" + }, + { + "cell_type": "code", + 
"metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Step 1: Compute month-level seasonality indicators\n", + "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", + "\n", + "row_seasonality_dt <- compute_month_seasonality(\n", + " input_dt=imputed_dt,\n", + " indicator=type_of_seasonality,\n", + " values_colname=imputed_col,\n", + " vector_of_durations=possible_month_block_sizes,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col,\n", + " proportion_threshold=threshold_for_seasonality,\n", + " use_calendar_year_denominator=use_calendar_year_denominator\n", + ")\n", + "\n", + "# Create the filename\n", + "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'row_seasonality', sep = '_')\n", + "filename_csv = glue(\"{file_stem}.csv\")\n", + "filename_parquet = glue(\"{file_stem}.parquet\")\n", + "fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", + "write_parquet(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", + "\n", + "# The seasonality per admin unit, irrespective of year ----------------------\n", + "\n", + "seasonality_source_dt <- process_seasonality(\n", + " input_dt=row_seasonality_dt,\n", + " indicator=type_of_seasonality,\n", + " vector_of_durations=possible_month_block_sizes,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col,\n", + " proportion_seasonal_years_threshold=threshold_proportion_seasonal_years\n", + ")\n", + "\n", + "# Compute the duration block; there are normal warnings when it's only 0-es for seasonality:\n", + "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", + "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", + "seasonality_source_dt <- 
seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" + ], + "execution_count": null, + "outputs": [], + "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b" + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Result file" + ], + "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### long" + ], + "id": "477fb459-0f98-4a32-96ab-f10b4395495f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This format, until further notice, is not saved." + ], + "id": "db719aed-6347-48f4-8984-add9f8adec2d" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "seasonality_long_dt <- melt(\n", + " seasonality_source_dt,\n", + " id.vars = grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE, invert = TRUE), # all cols which don't follow the pattern\n", + " variable.name = 'MONTH_BLOCK_SIZE',\n", + " value.name =seasonality_col\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", + "\n", + "# add remaining admin unit columns and save the final results\n", + "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499" + }, + { + 
"cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# order the columns\n", + "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", + "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" + ], + "execution_count": null, + "outputs": [], + "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Keeping for now.\n", + "# # filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", + "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", + "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" + ], + "execution_count": null, + "outputs": [], + "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015" + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Transform to wide format" + ], + "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "seasonality_wide_dt <- compute_min_seasonality_block(\n", + " input_dt=seasonality_source_dt,\n", + " seasonality_column_pattern=check_pattern_seasonality,\n", + " vector_of_possible_month_block_sizes=possible_month_block_sizes,\n", + " seasonal_blocksize_colname=season_duration_col,\n", + " valid_value = 1\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": 
"e71259c9-29a6-452f-8949-74adb0e62c1c" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", + "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", + "if (length(seasonality_pattern_cols) > 0L) {\n", + " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L), .SDcols = seasonality_pattern_cols]\n", + " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]\n", + "} else {\n", + " seasonality_wide_dt[, (seasonality_col) := NA_integer_]\n", + "}\n", + "\n", + "# Compute RAIN_PROPORTION: proportion of rainfall in the seasonal block vs ANNUAL total\n", + "# Only for seasonal admin units (SEASONALITY_RAINFALL = 1)\n", + "annual_totals_dt <- imputed_dt[, .(ANNUAL_TOTAL = sum(get(imputed_col), na.rm = TRUE)), by = c(admin_id_col, year_col)]\n", + "\n", + "seasonality_wide_dt[, (rain_proportion_col) := mapply(\n", + " compute_rain_proportion,\n", + " admin_id = get(admin_id_col),\n", + " block_duration = get(season_duration_col),\n", + " MoreArgs = list(\n", + " row_data = row_seasonality_dt,\n", + " annual_data = annual_totals_dt,\n", + " admin_col = admin_id_col,\n", + " year_column = year_col\n", + " )\n", + ")]\n", + "\n", + "# Set RAIN_PROPORTION to NA for non-seasonal admin units\n", + "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (rain_proportion_col) := NA_real_]\n", + "\n", + "# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block\n", + "# Only for seasonal admin units (SEASONALITY_RAINFALL = 1)\n", + "seasonality_wide_dt[, (season_start_month_col) := mapply(\n", + " compute_start_month,\n", + " admin_id = get(admin_id_col),\n", + " 
block_duration = get(season_duration_col),\n", + " MoreArgs = list(\n", + " row_data = row_seasonality_dt,\n", + " admin_col = admin_id_col,\n", + " year_column = year_col,\n", + " month_column = month_col\n", + " )\n", + ")]\n", + "\n", + "# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units\n", + "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" + ], + "execution_count": null, + "outputs": [], + "id": "9284c723-97d4-46f9-8837-7f70dae92a31" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# add remaining admin unit columns and save the final results\n", + "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", + "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", + "# head(admin_seasonality_wide_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Save output**" + ], + "id": "a2f1a373-4b34-42db-b591-b25c7050dee6" + }, + { + "cell_type": "code", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Create the filename\n", + "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')\n", + "filename_csv = glue(\"{file_stem}.csv\")\n", + "filename_parquet = glue(\"{file_stem}.parquet\")\n", + "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", + "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", + "log_msg(paste0(\"Rainfall seasonality results saved in folder \", OUTPUT_DATA_PATH))" + ], + "execution_count": null, + "outputs": 
[], + "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" + ], + "execution_count": null, + "outputs": [], + "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" + ], + "execution_count": null, + "outputs": [], + "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54" } - }, - "outputs": [], - "source": [ - "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_seasonality_rainfall/utils/snt_seasonality_rainfall.r b/pipelines/snt_seasonality_rainfall/utils/snt_seasonality_rainfall.r new file mode 100644 index 0000000..0665e80 --- /dev/null +++ 
b/pipelines/snt_seasonality_rainfall/utils/snt_seasonality_rainfall.r @@ -0,0 +1,66 @@ +compute_rain_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) { + if (is.na(block_duration) || is.infinite(block_duration)) { + return(NA_real_) + } + + sum_col <- paste("RAINFALL_SUM", block_duration, "MTH_FW", sep = "_") + if (!sum_col %in% names(row_data)) { + return(NA_real_) + } + + admin_row_data <- row_data[get(admin_col) == admin_id] + admin_annual_data <- annual_data[get(admin_col) == admin_id] + if (nrow(admin_row_data) == 0 || nrow(admin_annual_data) == 0) { + return(NA_real_) + } + + yearly_max_block <- admin_row_data[ + !is.na(get(sum_col)), + .(max_block_sum = if (.N > 0L) max(get(sum_col), na.rm = TRUE) else NA_real_), + by = year_column + ] + + yearly_max_block <- yearly_max_block[is.finite(max_block_sum)] + if (nrow(yearly_max_block) == 0) { + return(NA_real_) + } + + merged <- merge(yearly_max_block, admin_annual_data, by = year_column) + merged <- merged[ANNUAL_TOTAL > 0] + if (nrow(merged) == 0) { + return(NA_real_) + } + + merged[, prop := max_block_sum / ANNUAL_TOTAL] + mean(merged$prop, na.rm = TRUE) +} + + +compute_start_month <- function(admin_id, block_duration, row_data, admin_col, year_column, month_column) { + if (is.na(block_duration) || is.infinite(block_duration)) { + return(NA_integer_) + } + + seasonality_row_col <- paste("RAINFALL", block_duration, "MTH_ROW_SEASONALITY", sep = "_") + if (!seasonality_row_col %in% names(row_data)) { + return(NA_integer_) + } + + admin_row_data <- row_data[get(admin_col) == admin_id] + if (nrow(admin_row_data) == 0) { + return(NA_integer_) + } + + first_seasonal_months_by_year <- admin_row_data[ + get(seasonality_row_col) == 1, + .(first_month = min(get(month_column))), + by = year_column + ]$first_month + + if (length(first_seasonal_months_by_year) == 0) { + return(NA_integer_) + } + + month_counts <- table(first_seasonal_months_by_year) + 
as.integer(names(month_counts)[which.max(month_counts)]) +} From eca709c645db04edb0a469a0e6280f81419d83cb Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 31 Mar 2026 16:37:43 +0200 Subject: [PATCH 20/23] final --- .../code/snt_dhs_bednets_computation.ipynb | 2253 +++++++++-------- .../snt_dhs_careseeking_computation.ipynb | 1152 ++++----- .../code/snt_dhs_prevalence_computation.ipynb | 921 +++---- .../snt_dhs_vaccination_computation.ipynb | 1249 ++++----- .../snt_dhs_careseeking_report.ipynb | 632 ++--- .../utils/snt_dhs_careseeking_computation.r | 71 + .../utils/snt_dhs_indicator_tables.r | 174 ++ .../code/snt_healthcare_access.ipynb | 104 +- .../utils/snt_healthcare_access.r | 90 + .../reporting/snt_map_extracts_report.ipynb | 407 +-- .../utils/snt_map_extracts_report.r | 22 + .../code/snt_seasonality_rainfall.ipynb | 310 +-- .../snt_worldpop_extract_report.ipynb | 1049 ++++---- .../utils/snt_worldpop_extract_report.r | 56 + snt_dhs_indicators/pipeline.py | 3 + 15 files changed, 4591 insertions(+), 3902 deletions(-) create mode 100644 pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_computation.r create mode 100644 pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r create mode 100644 pipelines/snt_healthcare_access/utils/snt_healthcare_access.r create mode 100644 pipelines/snt_map_extracts/utils/snt_map_extracts_report.r create mode 100644 pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb index bea665f..f2fdb2e 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb @@ -1,1032 +1,1225 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "8ba79c20-9f47-4d61-93ab-19d3802125ec", - "metadata": {}, - "source": [ - "# Insecticide-teated net (ITN) access and use, DHS data" - ] - 
}, - { - "cell_type": "markdown", - "id": "d5d9b645-2094-4b60-a9b2-b89ba33ac4dc", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "90ddd8fd-ec6b-4a29-b78d-8010cdc4d40e", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Access_to_an_Insecticide-Treated_Net_ITN.htm\n", - "\n", - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm%23Percentage_of_the1bc-1&rhtocid=_15_3_0\n", - "\n", - "https://dhsprogram.com/publications/publication-dhsg4-dhs-questionnaires-and-manuals.cfm" - ] - }, - { - "cell_type": "markdown", - "id": "53012a66-e2c7-4ecb-9233-27581e349368", - "metadata": {}, - "source": [ - "### Access" - ] - }, - { - "cell_type": "markdown", - "id": "64d4e878-f065-47cc-be00-2b0cd3baeacc", - "metadata": {}, - "source": [ - "Percentage of the de facto household population with access to an ITN in the household, defined as the proportion of the de facto household population who slept under an ITN if each ITN in the household were used by up to two people.\n", - "\n", - "Numerator: Number of de facto persons (hv103 = 1) who could sleep under an ITN if each ITN in the household is used by up to 2 people, calculated for each household as the minimum of:\n", - "\n", - "1. number of de facto persons in the household (hv013), and\n", - "2. 
twice the number of ITNs per household (2 * sum of hml10_1 – hml10_7 = 1) <- assumed that maximum two people can sleep under a bednet\n", - " \n", - "Denominator: Number of persons who stayed in the household the night before the survey (hv103 = 1)\n", - "\n", - "Variables: hhid (household identification), hml10_1 – _7 (Insecticide-Treated Net (ITN)), hv013 (Number of de facto members) hv103, (Slept last night), hv005 (Household sample)" - ] - }, - { - "cell_type": "markdown", - "id": "5bd2650b-e952-45d1-b46a-b08b777a5961", - "metadata": {}, - "source": [ - "### Use" - ] - }, - { - "cell_type": "markdown", - "id": "327fe0a1-1be5-4d35-a6ff-8913ad56b6c3", - "metadata": {}, - "source": [ - "1) Percentage of the de facto household population who slept the night before the survey under a mosquito net (treated or untreated).\n", - "\n", - "2) Percentage of the de facto household population who slept the night before the survey under an insecticide-treated net (ITN).\n", - "\n", - "3) Among the de facto household population in households with at least one ITN, the percentage who slept under an ITN the night before the survey.\n", - "\n", - "Coverage:\n", - "Population base: De facto household members (PR file, HR file)\n", - "Time period: Night before the survey\n", - "\n", - "Numerators:\n", - "1) Number of de facto persons who reported sleeping under any mosquito net the night before the survey (hv103 = 1 & hml12 in 1:3)\n", - "2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)\n", - "3) Number of de facto persons in households with at least one ITN who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2 & any hml10_1 – hml10_7 = 1)\n", - "\n", - "Denominators:\n", - "a) Number of persons in the de facto household population (hv103 = 1)\n", - "b) Number of persons in the de facto household population (hv103 = 1)\n", - "c) Number of persons in the de facto household 
population in households owning at least one ITN (hv103 = 1 & any hml10_1 – hml10_7 = 1)\n", - "\n", - "Variables: HR file, PR file.\n", - "\n", - "\n", - "**Project uses numerator 2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)**\n", - "\n", - "**Project uses denominator b) Number of persons in the de facto household population (hv103 = 1)**" - ] - }, - { - "cell_type": "markdown", - "id": "3b050280-b234-45f1-bac5-8e6910079118", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3632c310-6a58-4825-8b80-ce3612b6caca", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb98532b-56c9-42c7-9bbd-a6f7869bfc76", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebf120a6-1559-4295-93c4-cfbfd141a67b", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc24cdd4-2ccb-4511-8a63-8ee4b0c29bde", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - 
"\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7acc95aa-eb5e-421e-b23e-0efa602d1cc1", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "93898419-b98d-4a53-8fc9-a1bb9bff01a4", - "metadata": {}, - "source": [ - "## Geo and admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "145fe721-f42a-45ff-a3cb-060886fe7a9e", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4938a133-569e-4aec-bf55-7a633a142bc2", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset \n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, 
spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31c87079-f8b6-417a-a206-804d5c3208e8", - "metadata": {}, - "outputs": [], - "source": [ - "spatial_data <- st_as_sf(spatial_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c0e39a-5433-4dc8-a109-f279f7be0271", - "metadata": {}, - "outputs": [], - "source": [ - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "6399c2eb-9509-4b4f-839a-6c8c83004510", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fb15b9d-3cc5-4c6f-be25-124169388c25", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "indicator_access <- 'PCT_ITN_ACCESS'\n", - "indicator_use <- 'PCT_ITN_USE'" - ] - }, - { - "cell_type": "markdown", - "id": "0de3a133-4873-43ad-8a33-ed6afa42330b", - "metadata": {}, - "source": [ - "### Unzip data for the analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b88b59b-383c-49a6-b476-9319042243e2", - "metadata": {}, - "outputs": [], - "source": [ - "household_recode <- 'HR'\n", - "person_recode <- 'PR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"bf5e0839-612a-430b-ad22-14bc62d6cad5", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca6e3da7-916c-47e1-b475-2bd5cf551bcd", - "metadata": {}, - "outputs": [], - "source": [ - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('_ITN_', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)" - ] - }, - { - "cell_type": "markdown", - "id": "02d577f1-007e-40b2-a3f6-e5b41089ee4a", - "metadata": {}, - "source": [ - "### Import data files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "930ef55c-a590-4d76-bcad-902528f8815a", - "metadata": {}, - "outputs": [], - "source": [ - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", - " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60283f74-b656-4596-82ff-aefa487ddd28", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", - "dhs_pr_dt <- setDT(dhs_pr_dt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c47b679-8735-4f15-bac5-6efcf912df86", - "metadata": {}, - "outputs": [], - "source": [ - "# Make admin codes and names dataframe (for future merging)\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2727c88e-45fb-4011-9fe8-99b55b8313e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "659a2a79-a563-4062-b7b4-25652e140c5c", - "metadata": {}, - "source": [ - "### Set relevant columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fe61ebf-11b1-41ee-beb5-a553fffc3015", - "metadata": {}, - "outputs": [], - "source": [ - "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", - "original_household_ITN_cols <- grep('HML10', names(dhs_hr_dt), value = TRUE)\n", - "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", - "household_inhabitants_col <- \"HV013\"\n", - "person_slept_col <- \"HV103\"\n", - "person_id_col <- \"HVIDX\"\n", - "person_bednet_col <- \"HML12\"" - ] - }, - { - "cell_type": "markdown", - "id": "8e71933c-df9f-413a-93eb-38f6e457ce7c", - "metadata": {}, - "source": [ - "## Preprocess Household recode data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72d8d33a-6163-493b-87b9-11e82eae5bf7", - "metadata": {}, - "outputs": [], - "source": [ - "# filter columns\n", - "hr_dt <- dhs_hr_dt[, .SD, .SDcols=c(household_id_cols, household_sampling_cols, household_inhabitants_col, original_household_ITN_cols)]\n", - "\n", - "# check i didn't omit any crucial variable\n", - "nrow(hr_dt[duplicated(hr_dt)])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58bec3c3-9c58-4ae9-8939-af545160239d", - "metadata": {}, - "outputs": [], - "source": [ - "sapply(original_household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "436a4f7b-b3a3-4849-8b4a-7205ffb64efd", - "metadata": {}, - "outputs": [], - "source": [ - "# make syntactically valid names\n", - "setnames(hr_dt, old = names(hr_dt), new = make.names(names(hr_dt)))\n", - "household_ITN_cols <- grep('HML10', names(hr_dt), value = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "id": "6508361d-f908-41fc-9a6b-ef709fcf24cd", - "metadata": {}, - "outputs": [], - "source": [ - "sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b89fdf1e-059a-4a39-be91-f7722c18824d", - "metadata": {}, - "outputs": [], - "source": [ - "# add admin name column\n", - "hr_dt <- merge.data.table(dhs_admin_dt, hr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)\n", - "\n", - "# sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))\n", - "\n", - "hr_dt[, (household_ITN_cols) := lapply(.SD, function(x) {\n", - " x <- as.integer(as.character(x)) # convert factors/characters to numeric\n", - " ifelse(is.na(x), 0, x)\n", - "}), .SDcols = household_ITN_cols]\n", - "\n", - "# compute the maximum potential users, given the number of ITNs present in the household\n", - "hr_dt[, max_users := 2 * rowSums(.SD, na.rm = TRUE), .SDcols = household_ITN_cols] # maximum 2 times the number of ITNs in the household\n", - "\n", - "# compute real potential users\n", - "hr_dt[, potential_users := pmin(max_users, HV013, na.rm = TRUE)]\n", - "\n", - "# compute weights\n", - "hr_dt[, wt := HV005/1000000]" - ] - }, - { - "cell_type": "markdown", - "id": "a17d5a0b-de6c-4e06-8567-e03a37936d65", - "metadata": {}, - "source": [ - "## Access to ITN" - ] - }, - { - "cell_type": "markdown", - "id": "c8d3241b-3f42-44fd-9cbc-1c5d3f7d877c", - "metadata": {}, - "source": [ - "### Preprocess person file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6aa78b5e-f5a3-492e-8859-8757fabc6e78", - "metadata": {}, - "outputs": [], - "source": [ - "# filter relevant columns\n", - "access_pr_dt <- dhs_pr_dt[, .SD, .SDcols = c(\n", - " household_id_cols,\n", - " person_id_col,\n", - " person_slept_col\n", - ")]\n", - "\n", - "# # check no necessary column was omitted\n", - "# nrow(access_pr_dt[duplicated(access_pr_dt)])" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "9eb38a46-4c62-4ed8-b33b-e4cb9b7508b3", - "metadata": {}, - "outputs": [], - "source": [ - "# make denominator: group and sum, removing NAs\n", - "access_pr_dt <- access_pr_dt[, .(total_slept = sum(get(person_slept_col), na.rm = TRUE)), by = household_id_cols]" - ] - }, - { - "cell_type": "markdown", - "id": "149e766d-0863-4c74-b12e-26727e940a8c", - "metadata": {}, - "source": [ - "### Join with household file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af6a7e45-18b1-4fbf-af8b-c57886920780", - "metadata": {}, - "outputs": [], - "source": [ - "# check merge with household file\n", - "check_perfect_match(hr_dt, 'HHID', access_pr_dt, 'HHID')\n", - "\n", - "# lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i))\n", - "if(!all(unlist((lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i)))))){\n", - " print('Person and Household data does not match')\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02c1997c-f5c6-4eee-b889-a8c8cc81dd0f", - "metadata": {}, - "outputs": [], - "source": [ - "access_dt <- merge.data.table(hr_dt, access_pr_dt, by = household_id_cols, all = TRUE)\n", - "\n", - "# filter rows\n", - "access_dt <- access_dt[total_slept > 0] # to not divide by 0 (only households where someone slept last night)" - ] - }, - { - "cell_type": "markdown", - "id": "50e7c498-1141-46a4-b4fc-b5c4617fb956", - "metadata": {}, - "source": [ - "DHS guidelines for the calculation of “potential users”: \"In households which have more than 1 ITN for every 2 people, the product of this calculation will be greater than the number of individuals who spent the previous night. 
In this case, the “potential users” variable in that household should be modified to reflect the number of individuals who spent the previous night in the household because the number of potential users in a household cannot exceed the number of individuals who spent the previous night in that household.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b20ff38-d658-43c8-83d5-5357bc7a293a", - "metadata": {}, - "outputs": [], - "source": [ - "access_dt[, foo := fifelse(\n", - " potential_users > total_slept,\n", - " total_slept,\n", - " potential_users\n", - ")]" - ] - }, - { - "cell_type": "markdown", - "id": "fddd66b0-c351-4aa4-bdaa-47542fbdee9d", - "metadata": {}, - "source": [ - "### Compute ITN access indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "caa18abf-6589-4846-a705-897ce3943692", - "metadata": {}, - "outputs": [], - "source": [ - "access_dt[, (indicator_access) := potential_users / total_slept]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ff9c48c-1c35-4e14-a4bf-417040e9e4f7", - "metadata": {}, - "outputs": [], - "source": [ - "summary(access_dt[[indicator_access]])" - ] - }, - { - "cell_type": "markdown", - "id": "9c53f039-299a-4c02-a728-77a77d088472", - "metadata": {}, - "source": [ - "#### Account for the sampling strategy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9df627ce-0424-4341-9fcd-b1f13cd769b6", - "metadata": {}, - "outputs": [], - "source": [ - "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", - "access_design_sampling = svydesign(\n", - " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = access_dt, # dataset\n", - " strata = ~ HV023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f699992f-e931-4657-abf3-7a2e13a89dbf", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_access_table <- svyby(formula = as.formula(paste(\"~\", indicator_access)),\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = access_design_sampling,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bfc79978-0f59-444b-a8f1-3420864489b1", - "metadata": {}, - "outputs": [], - "source": [ - "setDT(bednet_access_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e71c083-0b67-43db-8f0b-eda14d8a2be3", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_access)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_access)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_access)}_SAMPLE_AVERAGE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c92b8b6-d343-4a69-b508-ae23ebbae48b", - "metadata": {}, - "outputs": [], - "source": [ - "# names(bednet_access_table) <- toupper(names(bednet_access_table))\n", - "names(bednet_access_table)[names(bednet_access_table) == 'ci_l'] <- lower_bound_col\n", - "names(bednet_access_table)[names(bednet_access_table) == 'ci_u'] <- upper_bound_col\n", - "names(bednet_access_table)[names(bednet_access_table) == indicator_access] <- sample_avg_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98542095-2e06-4860-b038-c7eeceeb6265", - "metadata": {}, - "outputs": [], - "source": [ - "# Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - "bednet_access_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - 
"bednet_access_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e920e5-3715-4971-bc6c-62291dc59fc5", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to percentages\n", - "bednet_access_table[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - "bednet_access_table[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - "bednet_access_table[, (sample_avg_col) := get(sample_avg_col) * 100]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67cb29d2-ee18-4d28-9c72-e253f8dcb5b6", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_access_table <- merge.data.table(admin_data, bednet_access_table, by = admin_name_col, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54935f07-b254-458d-b1fb-8c7a921b5bd9", - "metadata": {}, - "outputs": [], - "source": [ - "head(bednet_access_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85c1bf55-39d6-4c70-88bf-46542f158b0c", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}\")\n", - "write.csv(bednet_access_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(bednet_access_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "markdown", - "id": "484e0a38-8c37-4213-a750-babfaf9107bc", - "metadata": {}, - "source": [ - "## ITN use" - ] - }, - { - "cell_type": "markdown", - "id": "1ebdff14-edbd-4fd5-85f5-230b1e27adb7", - "metadata": {}, - "source": [ - "### Preprocess person file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a3536b2-6fc4-412f-91bc-21466b688e59", - "metadata": {}, - "outputs": [], - "source": [ - "# filter columns\n", - "use_pr_dt <- dhs_pr_dt[, .SD, 
.SDcols=c(household_id_cols, person_id_col, person_slept_col, person_bednet_col)]\n", - "\n", - "# check no necessary column was omitted\n", - "nrow(use_pr_dt[duplicated(use_pr_dt)])\n", - "\n", - "# # for(i in person_slept_col){print(table(access_pr_dt[[i]]))}\n", - "# sapply(person_bednet_col, function(i) table(use_pr_dt[[i]], useNA = 'always'))" - ] - }, - { - "cell_type": "markdown", - "id": "a303e6ec-a3ac-4a20-8f3b-40a06d8067e3", - "metadata": {}, - "source": [ - "The DHS guide ( https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm ) suggests to use both 1 & 2 as possible values for HML12; but 2 is \"Both treated (ITN) and untreated nets\"; using as specified in the guide, but to be kept in mind" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35211f4e-e84d-4fbc-be01-577912300d29", - "metadata": {}, - "outputs": [], - "source": [ - "# # group and sum, removing NAs and keeping only 1 as valid value\n", - "# use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", - "# get(person_slept_col) == 1 & (get(person_bednet_col) == 1)\n", - "# )]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "132e025d-a699-4c0c-a20d-d4f7503811a6", - "metadata": {}, - "outputs": [], - "source": [ - "# group and sum, removing NAs\n", - "use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", - " get(person_slept_col) == 1 & (get(person_bednet_col) %in% c(1, 2))\n", - ")]\n", - "\n", - "# check recodings are correct\n", - "xtabs(~ get(person_slept_col) + get(person_bednet_col) + slept_itn, data = use_pr_dt, addNA = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c883b64-605c-4825-b16f-5d4b2f6febf9", - "metadata": {}, - "outputs": [], - "source": [ - "use_pr_dt <- use_pr_dt[, .(\n", - " total_slept = sum(get(person_slept_col), na.rm = TRUE),\n", - " total_slept_itn = sum(get(\"slept_itn\"), na.rm = TRUE)\n", - "), by = household_id_cols\n", - "]\n", - 
"\n", - "use_pr_dt[, (indicator_use) := total_slept_itn / total_slept]" - ] - }, - { - "cell_type": "markdown", - "id": "ab6ac712-21d4-478b-9f48-fe9d39c4ffc5", - "metadata": {}, - "source": [ - "### Join with household file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e164f7b2-26c4-4357-8b1d-5acad2de6e54", - "metadata": {}, - "outputs": [], - "source": [ - "use_dt <- merge.data.table(hr_dt, use_pr_dt, by = household_id_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "16738a93-d779-4671-9d4a-790e08860f6d", - "metadata": {}, - "source": [ - "### Compute ITN use indicator" - ] - }, - { - "cell_type": "markdown", - "id": "500726f9-5091-45f7-b164-1ab50c362587", - "metadata": {}, - "source": [ - "#### Account for sampling strategy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12f19a36-722c-47a1-8183-7022e9674ade", - "metadata": {}, - "outputs": [], - "source": [ - "use_design_sampling = svydesign(\n", - " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = use_dt, # dataset\n", - " strata = ~ HV023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95ca0478-3d3a-42b6-85df-41e6e1fe19fa", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_use_table <- svyby(formula = as.formula(paste(\"~\", indicator_use)),\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = use_design_sampling,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4a571c0d-2a47-423c-b716-a46589f7f41f", - "metadata": {}, - "outputs": [], - "source": [ - "setDT(bednet_use_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79b8ae99-1fb9-4611-b17d-4f5c98e3d9e3", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_use)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_use)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_use)}_SAMPLE_AVERAGE\")\n", - "\n", - "names(bednet_use_table)[names(bednet_use_table) == 'ci_l'] <- lower_bound_col\n", - "names(bednet_use_table)[names(bednet_use_table) == 'ci_u'] <- upper_bound_col\n", - "names(bednet_use_table)[names(bednet_use_table) == indicator_use] <- sample_avg_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bb41a0b-a67f-4710-9749-498d433ee270", - "metadata": {}, - "outputs": [], - "source": [ - "# Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - "bednet_use_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - "bednet_use_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a6d7558-378a-4d85-a2d2-6916e227ff19", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to percentages\n", -
"bednet_use_table[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - "bednet_use_table[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - "bednet_use_table[, (sample_avg_col) := get(sample_avg_col) * 100]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b886265-bca7-492c-b216-15209da1d515", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_use_table <- merge.data.table(admin_data, bednet_use_table, by = admin_name_col, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47e8ec4b-b90d-434f-b6a5-5962d43451e5", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_use}\")\n", - "write.csv(bednet_use_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(bednet_use_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0aec24e-9feb-4a7a-a491-be7f44617ac0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "8ba79c20-9f47-4d61-93ab-19d3802125ec", + "metadata": {}, + "source": [ + "# Insecticide-treated net (ITN) access and use, DHS data" + ] + }, + { + "cell_type": "markdown", + "id": "d5d9b645-2094-4b60-a9b2-b89ba33ac4dc", + "metadata": {}, + "source": [ + "## Resources" + ] + }, + { + "cell_type": "markdown", + "id": "90ddd8fd-ec6b-4a29-b78d-8010cdc4d40e", + "metadata": {}, + "source": [ +
"https://dhsprogram.com/data/Guide-to-DHS-Statistics/Access_to_an_Insecticide-Treated_Net_ITN.htm\n", + "\n", + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm%23Percentage_of_the1bc-1&rhtocid=_15_3_0\n", + "\n", + "https://dhsprogram.com/publications/publication-dhsg4-dhs-questionnaires-and-manuals.cfm" + ] + }, + { + "cell_type": "markdown", + "id": "53012a66-e2c7-4ecb-9233-27581e349368", + "metadata": {}, + "source": [ + "### Access" + ] + }, + { + "cell_type": "markdown", + "id": "64d4e878-f065-47cc-be00-2b0cd3baeacc", + "metadata": {}, + "source": [ + "Percentage of the de facto household population with access to an ITN in the household, defined as the proportion of the de facto household population who slept under an ITN if each ITN in the household were used by up to two people.\n", + "\n", + "Numerator: Number of de facto persons (hv103 = 1) who could sleep under an ITN if each ITN in the household is used by up to 2 people, calculated for each household as the minimum of:\n", + "\n", + "1. number of de facto persons in the household (hv013), and\n", + "2. 
twice the number of ITNs per household (2 * sum of hml10_1 – hml10_7 = 1) <- assumed that maximum two people can sleep under a bednet\n", + " \n", + "Denominator: Number of persons who stayed in the household the night before the survey (hv103 = 1)\n", + "\n", + "Variables: hhid (household identification), hml10_1 – _7 (Insecticide-Treated Net (ITN)), hv013 (Number of de facto members), hv103 (Slept last night), hv005 (Household sample weight)" + ] + }, + { + "cell_type": "markdown", + "id": "5bd2650b-e952-45d1-b46a-b08b777a5961", + "metadata": {}, + "source": [ + "### Use" + ] + }, + { + "cell_type": "markdown", + "id": "327fe0a1-1be5-4d35-a6ff-8913ad56b6c3", + "metadata": {}, + "source": [ + "1) Percentage of the de facto household population who slept the night before the survey under a mosquito net (treated or untreated).\n", + "\n", + "2) Percentage of the de facto household population who slept the night before the survey under an insecticide-treated net (ITN).\n", + "\n", + "3) Among the de facto household population in households with at least one ITN, the percentage who slept under an ITN the night before the survey.\n", + "\n", + "Coverage:\n", + "Population base: De facto household members (PR file, HR file)\n", + "Time period: Night before the survey\n", + "\n", + "Numerators:\n", + "1) Number of de facto persons who reported sleeping under any mosquito net the night before the survey (hv103 = 1 & hml12 in 1:3)\n", + "2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)\n", + "3) Number of de facto persons in households with at least one ITN who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2 & any hml10_1 – hml10_7 = 1)\n", + "\n", + "Denominators:\n", + "a) Number of persons in the de facto household population (hv103 = 1)\n", + "b) Number of persons in the de facto household population (hv103 = 1)\n", + "c) Number of persons in the de facto household
population in households owning at least one ITN (hv103 = 1 & any hml10_1 – hml10_7 = 1)\n", + "\n", + "Variables: HR file, PR file.\n", + "\n", + "\n", + "**Project uses numerator 2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)**\n", + "\n", + "**Project uses denominator b) Number of persons in the de facto household population (hv103 = 1)**" + ] + }, + { + "cell_type": "markdown", + "id": "3b050280-b234-45f1-bac5-8e6910079118", + "metadata": {}, + "source": [ + "## Preliminary steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3632c310-6a58-4825-8b80-ce3612b6caca", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb98532b-56c9-42c7-9bbd-a6f7869bfc76", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf120a6-1559-4295-93c4-cfbfd141a67b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc24cdd4-2ccb-4511-8a63-8ee4b0c29bde", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load utils\n", + 
"source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7acc95aa-eb5e-421e-b23e-0efa602d1cc1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ] + }, + { + "cell_type": "markdown", + "id": "93898419-b98d-4a53-8fc9-a1bb9bff01a4", + "metadata": {}, + "source": [ + "## Geo and admin data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "145fe721-f42a-45ff-a3cb-060886fe7a9e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4938a133-569e-4aec-bf55-7a633a142bc2", + "metadata": { + "vscode": { + "languageId": "r" 
+ } + }, + "outputs": [], + "source": [ + "# Load spatial file from dataset \n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31c87079-f8b6-417a-a206-804d5c3208e8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "spatial_data <- st_as_sf(spatial_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98c0e39a-5433-4dc8-a109-f279f7be0271", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ] + }, + { + "cell_type": "markdown", + "id": "6399c2eb-9509-4b4f-839a-6c8c83004510", + "metadata": {}, + "source": [ + "## Import DHS data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fb15b9d-3cc5-4c6f-be25-124169388c25", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_source <- 'DHS'\n", + "indicator_access <- 'PCT_ITN_ACCESS'\n", + "indicator_use <- 'PCT_ITN_USE'" + ] + }, + { + 
"cell_type": "markdown", + "id": "0de3a133-4873-43ad-8a33-ed6afa42330b", + "metadata": {}, + "source": [ + "### Unzip data for the analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b88b59b-383c-49a6-b476-9319042243e2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "household_recode <- 'HR'\n", + "person_recode <- 'PR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf5e0839-612a-430b-ad22-14bc62d6cad5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca6e3da7-916c-47e1-b475-2bd5cf551bcd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('_ITN_', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)" + ] + }, + { + "cell_type": "markdown", + "id": "02d577f1-007e-40b2-a3f6-e5b41089ee4a", + "metadata": {}, + "source": [ + "### Import data files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930ef55c-a590-4d76-bcad-902528f8815a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_extension <- '.SAV'\n", + 
"dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", + " stop(\"The necessary DHS data do not have the same version/issue. Check available data before rerunning.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60283f74-b656-4596-82ff-aefa487ddd28", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", + "dhs_pr_dt <- setDT(dhs_pr_dt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c47b679-8735-4f15-bac5-6efcf912df86", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Make admin codes and names dataframe (for future merging)\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2727c88e-45fb-4011-9fe8-99b55b8313e8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ 
+ "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "659a2a79-a563-4062-b7b4-25652e140c5c", + "metadata": {}, + "source": [ + "### Set relevant columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fe61ebf-11b1-41ee-beb5-a553fffc3015", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", + "original_household_ITN_cols <- grep('HML10', names(dhs_hr_dt), value = TRUE)\n", + "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", + "household_inhabitants_col <- \"HV013\"\n", + "person_slept_col <- \"HV103\"\n", + "person_id_col <- \"HVIDX\"\n", + "person_bednet_col <- \"HML12\"" + ] + }, + { + "cell_type": "markdown", + "id": "8e71933c-df9f-413a-93eb-38f6e457ce7c", + "metadata": {}, + "source": [ + "## Preprocess Household recode data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72d8d33a-6163-493b-87b9-11e82eae5bf7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# filter columns\n", + "hr_dt <- dhs_hr_dt[, .SD, .SDcols=c(household_id_cols, household_sampling_cols, household_inhabitants_col, original_household_ITN_cols)]\n", + "\n", + "# check i didn't omit any crucial variable\n", + "nrow(hr_dt[duplicated(hr_dt)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58bec3c3-9c58-4ae9-8939-af545160239d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "sapply(original_household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "436a4f7b-b3a3-4849-8b4a-7205ffb64efd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# make syntactically valid names\n", + "setnames(hr_dt, old = names(hr_dt), new = make.names(names(hr_dt)))\n", + "household_ITN_cols <- grep('HML10', names(hr_dt), value = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6508361d-f908-41fc-9a6b-ef709fcf24cd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b89fdf1e-059a-4a39-be91-f7722c18824d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# add admin name column\n", + "hr_dt <- merge.data.table(dhs_admin_dt, hr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)\n", + "\n", + "# sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))\n", + "\n", + "hr_dt[, (household_ITN_cols) := lapply(.SD, function(x) {\n", + " x <- as.integer(as.character(x)) # convert factors/characters to numeric\n", + " ifelse(is.na(x), 0, x)\n", + "}), .SDcols = household_ITN_cols]\n", + "\n", + "# compute the maximum potential users, given the number of ITNs present in the household\n", + "hr_dt[, max_users := 2 * rowSums(.SD, na.rm = TRUE), .SDcols = household_ITN_cols] # maximum 2 times the number of ITNs in the household\n", + "\n", + "# compute real potential users\n", + "hr_dt[, potential_users := pmin(max_users, HV013, na.rm = TRUE)]\n", + "\n", + "# compute weights\n", + "hr_dt[, wt := HV005/1000000]" + ] + }, + { + "cell_type": "markdown", + "id": "a17d5a0b-de6c-4e06-8567-e03a37936d65", + "metadata": {}, + "source": [ + "## Access to ITN" + ] + }, + { + "cell_type": "markdown", + "id": "c8d3241b-3f42-44fd-9cbc-1c5d3f7d877c", + "metadata": {}, + "source": [ + "### Preprocess 
person file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6aa78b5e-f5a3-492e-8859-8757fabc6e78", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# filter relevant columns\n", + "access_pr_dt <- dhs_pr_dt[, .SD, .SDcols = c(\n", + " household_id_cols,\n", + " person_id_col,\n", + " person_slept_col\n", + ")]\n", + "\n", + "# # check no necessary column was omitted\n", + "# nrow(access_pr_dt[duplicated(access_pr_dt)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eb38a46-4c62-4ed8-b33b-e4cb9b7508b3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# make denominator: group and sum, removing NAs\n", + "access_pr_dt <- access_pr_dt[, .(total_slept = sum(get(person_slept_col), na.rm = TRUE)), by = household_id_cols]" + ] + }, + { + "cell_type": "markdown", + "id": "149e766d-0863-4c74-b12e-26727e940a8c", + "metadata": {}, + "source": [ + "### Join with household file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af6a7e45-18b1-4fbf-af8b-c57886920780", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# check merge with household file\n", + "check_perfect_match(hr_dt, 'HHID', access_pr_dt, 'HHID')\n", + "\n", + "# lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i))\n", + "if(!all(unlist((lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i)))))){\n", + " print('Person and Household data does not match')\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02c1997c-f5c6-4eee-b889-a8c8cc81dd0f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "access_dt <- merge.data.table(hr_dt, access_pr_dt, by = household_id_cols, all = TRUE)\n", + "\n", + "# filter rows\n", + "access_dt <- access_dt[total_slept > 0] # to not divide by 0 
(only households where someone slept last night)" + ] + }, + { + "cell_type": "markdown", + "id": "50e7c498-1141-46a4-b4fc-b5c4617fb956", + "metadata": {}, + "source": [ + "DHS guidelines for the calculation of “potential users”: \"In households which have more than 1 ITN for every 2 people, the product of this calculation will be greater than the number of individuals who spent the previous night. In this case, the “potential users” variable in that household should be modified to reflect the number of individuals who spent the previous night in the household because the number of potential users in a household cannot exceed the number of individuals who spent the previous night in that household.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b20ff38-d658-43c8-83d5-5357bc7a293a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "access_dt[, potential_users := fifelse(\n", + " potential_users > total_slept,\n", + " total_slept,\n", + " potential_users\n", + ")] # overwrite in place so the access ratio computed below uses the capped value" + ] + }, + { + "cell_type": "markdown", + "id": "fddd66b0-c351-4aa4-bdaa-47542fbdee9d", + "metadata": {}, + "source": [ + "### Compute ITN access indicator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caa18abf-6589-4846-a705-897ce3943692", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "access_dt[, (indicator_access) := potential_users / total_slept]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ff9c48c-1c35-4e14-a4bf-417040e9e4f7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "summary(access_dt[[indicator_access]])" + ] + }, + { + "cell_type": "markdown", + "id": "9c53f039-299a-4c02-a728-77a77d088472", + "metadata": {}, + "source": [ + "#### Account for the sampling strategy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9df627ce-0424-4341-9fcd-b1f13cd769b6", + "metadata": {
+ "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", + "access_design_sampling <- svydesign(\n", + " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = access_dt, # dataset\n", + " strata = ~ HV023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # NOTE(review): not a documented svydesign() argument; presumably absorbed by `...` - confirm and drop\n", + " nest = TRUE # the primary sampling units are nested within the strata\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f699992f-e931-4657-abf3-7a2e13a89dbf", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}\")\n", + "bednet_access_table <- compute_and_export_indicator_table(\n", + " design_obj = access_design_sampling,\n", + " indicator_name = indicator_access,\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " filename_without_extension = filename_without_extension\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfc79978-0f59-444b-a8f1-3420864489b1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Access indicator table computed and exported in previous cell."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e71c083-0b67-43db-8f0b-eda14d8a2be3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c92b8b6-d343-4a69-b508-ae23ebbae48b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98542095-2e06-4860-b038-c7eeceeb6265", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e920e5-3715-4971-bc6c-62291dc59fc5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67cb29d2-ee18-4d28-9c72-e253f8dcb5b6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54935f07-b254-458d-b1fb-8c7a921b5bd9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(bednet_access_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85c1bf55-39d6-4c70-88bf-46542f158b0c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# already exported by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "markdown", + "id": "484e0a38-8c37-4213-a750-babfaf9107bc", + "metadata": {}, + "source": [ + "## ITN use" + ] + }, + { + "cell_type": "markdown", + "id": "1ebdff14-edbd-4fd5-85f5-230b1e27adb7", + 
"metadata": {}, + "source": [ + "### Preprocess person file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a3536b2-6fc4-412f-91bc-21466b688e59", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# filter columns\n", + "use_pr_dt <- dhs_pr_dt[, .SD, .SDcols=c(household_id_cols, person_id_col, person_slept_col, person_bednet_col)]\n", + "\n", + "# check no necessary column was omitted\n", + "nrow(use_pr_dt[duplicated(use_pr_dt)])\n", + "\n", + "# # for(i in person_slept_col){print(table(access_pr_dt[[i]]))}\n", + "# sapply(person_bednet_col, function(i) table(use_pr_dt[[i]], useNA = 'always'))" + ] + }, + { + "cell_type": "markdown", + "id": "a303e6ec-a3ac-4a20-8f3b-40a06d8067e3", + "metadata": {}, + "source": [ + "The DHS guide ( https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm ) suggests to use both 1 & 2 as possible values for HML12; but 2 is \"Both treated (ITN) and untreated nets\"; using as specified in the guide, but to be kept in mind" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35211f4e-e84d-4fbc-be01-577912300d29", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # group and sum, removing NAs and keeping only 1 as valid value\n", + "# use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", + "# get(person_slept_col) == 1 & (get(person_bednet_col) == 1)\n", + "# )]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "132e025d-a699-4c0c-a20d-d4f7503811a6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# group and sum, removing NAs\n", + "use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", + " get(person_slept_col) == 1 & (get(person_bednet_col) %in% c(1, 2))\n", + ")]\n", + "\n", + "# check recodings are correct\n", + "xtabs(~ get(person_slept_col) + get(person_bednet_col) + slept_itn, 
data = use_pr_dt, addNA = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c883b64-605c-4825-b16f-5d4b2f6febf9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "use_pr_dt <- use_pr_dt[, .(\n", + " total_slept = sum(get(person_slept_col), na.rm = TRUE),\n", + " total_slept_itn = sum(get(\"slept_itn\"), na.rm = TRUE)\n", + "), by = household_id_cols\n", + "]\n", + "\n", + "use_pr_dt[, (indicator_use) := total_slept_itn / total_slept]" + ] + }, + { + "cell_type": "markdown", + "id": "ab6ac712-21d4-478b-9f48-fe9d39c4ffc5", + "metadata": {}, + "source": [ + "### Join with household file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e164f7b2-26c4-4357-8b1d-5acad2de6e54", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "use_dt <- merge.data.table(hr_dt, use_pr_dt, by = household_id_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "16738a93-d779-4671-9d4a-790e08860f6d", + "metadata": {}, + "source": [ + "### Compute ITN use indicator" + ] + }, + { + "cell_type": "markdown", + "id": "500726f9-5091-45f7-b164-1ab50c362587", + "metadata": {}, + "source": [ + "#### Account for sampling strategy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12f19a36-722c-47a1-8183-7022e9674ade", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "use_design_sampling = svydesign(\n", + " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = use_dt, # dataset\n", + " strata = ~ HV023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? 
dunno what this is\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95ca0478-3d3a-42b6-85df-41e6e1fe19fa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_use}\")\n", + "bednet_use_table <- compute_and_export_indicator_table(\n", + " design_obj = use_design_sampling,\n", + " indicator_name = indicator_use,\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " filename_without_extension = filename_without_extension\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a571c0d-2a47-423c-b716-a46589f7f41f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Use indicator table computed and exported in previous cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b8ae99-1fb9-4611-b17d-4f5c98e3d9e3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bb41a0b-a67f-4710-9749-498d433ee270", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a6d7558-378a-4d85-a2d2-6916e227ff19", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b886265-bca7-492c-b216-15209da1d515", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# handled by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47e8ec4b-b90d-434f-b6a5-5962d43451e5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# already exported by compute_and_export_indicator_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0aec24e-9feb-4a7a-a491-be7f44617ac0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb index 39e5171..aa7435a 100644 
--- a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb @@ -1,605 +1,557 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "c9aadda5-521c-4345-916c-a60f9802a853", - "metadata": {}, - "source": [ - "# Careseeking behavior upon child fever (DHS data)" - ] - }, - { - "cell_type": "markdown", - "id": "4ce067b3-f707-4df6-b7de-30389f125534", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "a71afb11-ea04-40e9-8566-4b80a0857e25", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Fever_and_Careseeking.htm?rhtocid=_13_3_0#Percentage_of_children4\n", - "\n", - "1) Percentage of children under age 5 years with fever in the 2 weeks preceding the survey.\n", - "2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\n", - "3) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought the same or next day following the onset of fever.\n", - "4) Among children under age 5 with fever in the 2 weeks preceding the survey, percentage who took antibiotic drugs.\n", - "\n", - "Coverage:\n", - "\n", - "Population base: Living children under age 5 years (KR file)\n", - "\n", - "Time period: Two weeks preceding the survey\n", - "\n", - "Numerators:\n", - "1) Number of living children under age 5 years with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 & h22 = 1)\n", - "2) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 except traditional practitioner (usually h32t))\n", - "3) Number of living children under age 5 years with a fever at any time during the 2 weeks 
preceding the interview for whom advice or treatment was sought the same day or next day following the onset of fever (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 excluding advice or treatment from a traditional practitioner (usually h32t) & h46b in 0:1)\n", - "4) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview who took antibiotic drugs (b5 = 1 & h22 = 1 & (h37i = 1 or h37j = 1 or h37n= 1 or h37o = 1) [or ml13i = 1 or ml13j = 1 or ml13n = 1 or ml13o = 1])\n", - "\n", - "Denominators:\n", - "- Numerator 1: Number of living children under age 5 (b5 = 1 & b19 < 60)\n", - "\n", - "- Numerators 2, 3, and 4: Number of living children under age 5 with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 and h22 = 1).\n", - "\n", - "Project uses (split by \"private/public\"): \"2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\"" - ] - }, - { - "cell_type": "markdown", - "id": "570901a3-7312-4583-bb4f-753e7e7c0ca2", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58c2d4af-aa4e-45ad-9ef7-6dba584f12ca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "690423e8-6e7d-49fc-8f01-4a1f445fa537", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- 
file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "687e8bab-120e-4367-b4fe-43c3dee11185", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "e07d8a08-5bae-4189-80bf-57d6ba653e83", - "metadata": {}, - "source": [ - "## Spatial/admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18fd6f07-2e0f-4e26-90d4-ac07e052379a", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "c9aadda5-521c-4345-916c-a60f9802a853", + "metadata": {}, + "source": [ + "# Careseeking behavior upon child fever (DHS data)" + ] + }, + { + "cell_type": "markdown", + "id": "4ce067b3-f707-4df6-b7de-30389f125534", + "metadata": {}, + "source": [ + "## Resources" + ] + }, + 
{ + "cell_type": "markdown", + "id": "a71afb11-ea04-40e9-8566-4b80a0857e25", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Fever_and_Careseeking.htm?rhtocid=_13_3_0#Percentage_of_children4\n", + "\n", + "1) Percentage of children under age 5 years with fever in the 2 weeks preceding the survey.\n", + "2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\n", + "3) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought the same or next day following the onset of fever.\n", + "4) Among children under age 5 with fever in the 2 weeks preceding the survey, percentage who took antibiotic drugs.\n", + "\n", + "Coverage:\n", + "\n", + "Population base: Living children under age 5 years (KR file)\n", + "\n", + "Time period: Two weeks preceding the survey\n", + "\n", + "Numerators:\n", + "1) Number of living children under age 5 years with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 & h22 = 1)\n", + "2) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 except traditional practitioner (usually h32t))\n", + "3) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought the same day or next day following the onset of fever (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 excluding advice or treatment from a traditional practitioner (usually h32t) & h46b in 0:1)\n", + "4) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview who took antibiotic drugs (b5 = 1 & h22 = 1 & (h37i = 1 or h37j = 1 or h37n= 1 or h37o = 1) [or ml13i = 1 or ml13j = 1 or ml13n 
= 1 or ml13o = 1])\n", + "\n", + "Denominators:\n", + "- Numerator 1: Number of living children under age 5 (b5 = 1 & b19 < 60)\n", + "\n", + "- Numerators 2, 3, and 4: Number of living children under age 5 with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 and h22 = 1).\n", + "\n", + "Project uses (split by \"private/public\"): \"2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\"" + ] + }, + { + "cell_type": "markdown", + "id": "570901a3-7312-4583-bb4f-753e7e7c0ca2", + "metadata": {}, + "source": [ + "## Preliminary steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58c2d4af-aa4e-45ad-9ef7-6dba584f12ca", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "690423e8-6e7d-49fc-8f01-4a1f445fa537", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_careseeking_computation.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- 
c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "687e8bab-120e-4367-b4fe-43c3dee11185", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ] + }, + { + "cell_type": "markdown", + "id": "e07d8a08-5bae-4189-80bf-57d6ba653e83", + "metadata": {}, + "source": [ + "## Spatial/admin data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18fd6f07-2e0f-4e26-90d4-ac07e052379a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e0ebcf5-45d4-4c40-be95-cf27b0a3ae75", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + 
"spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ] + }, + { + "cell_type": "markdown", + "id": "51c988f2-1fb5-4e35-8920-c5d341a543f3", + "metadata": {}, + "source": [ + "## Import DHS data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "599d0f6e-d0f5-42fd-8e78-6a87bdbd12a6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_source <- 'DHS'\n", + "\n", + "indicator_public_care <- 'PCT_PUBLIC_CARE'\n", + "indicator_private_care <- 'PCT_PRIVATE_CARE'\n", + "indicator_no_care <- 'PCT_NO_CARE'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9589f2b2-c0ad-4a46-8778-faa4f427eb37", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "household_recode <- 'HR'\n", + "kid_recode <- 'KR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", + "\n", + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + 
"unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", + " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", + "dhs_kr_dt <- setDT(dhs_kr_dt)" + ] + }, + { + "cell_type": "markdown", + "id": "c26e24f7-0a59-4dbf-b493-a760c9d30e39", + "metadata": {}, + "source": [ + "### Make admin codes and names dataframe (for future merging)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f244f017-b532-4106-8beb-4bf7c0ed4d02", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52af343a-bbee-4601-9e3f-adceb60d809f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rm(dhs_hr_dt) # will not be used further" + ] + }, + { + "cell_type": "markdown", + "id": "c35b457b-e6ac-42c6-bad5-4ee328e94177", + "metadata": {}, + "source": [ + "### Relevant columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ea43f49-f9dd-4403-a3b4-885ff31da8bb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "kid_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", + "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", + "kid_alive_col <- \"B5\"\n", + "kid_age_col <- \"B19\"\n", + "kid_fever_col <- \"H22\"\n", + "# grep(\"^H32\", names(dhs_kr_dt), value = TRUE)\n", + "kid_public_care_cols <- c(\"H32A\", \"H32B\", \"H32C\", \"H32D\", \"H32E\", \"H32F\", \"H32G\", \"H32H\", \"H32I\")\n", + "kid_private_care_cols <- c(\"H32J\", \"H32K\", \"H32L\", \"H32M\", \"H32N\", \"H32O\", \"H32P\", \"H32Q\", \"H32R\",\n", + " \"H32NA\", \"H32NB\", \"H32NC\", \"H32ND\", \"H32NE\")\n", + "kid_other_care_cols <- c(\"H32S\", \"H32W\", \"H32X\")" + ] + }, + { + "cell_type": "markdown", + "id": "6b0924e4-aef6-4e1c-be29-4f818fb14d8e", + "metadata": {}, + "source": [ + "## Preprocess kid file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5184585f-bb68-4e99-89e2-ff52ffbf2eed", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# filter rows and columns\n", + "kr_dt <- dhs_kr_dt[(\n", + " !(is.na(get(kid_age_col))) & # no missing age\n", + " get(kid_age_col) < 60 & # younger than 5\n", + " get(kid_alive_col) == 1 & # alive\n", + " get(kid_fever_col) == 1 # had fever\n", + " ),\n", + " .SD, .SDcols = c(\n", + " kid_id_cols,\n", + " 
household_sampling_cols,\n", + " kid_alive_col,\n", + " kid_age_col,\n", + " kid_fever_col,\n", + " kid_other_care_cols,\n", + " kid_public_care_cols,\n", + " kid_private_care_cols\n", + " )]\n", + "\n", + "kr_dt[, wt := V005/1000000]\n", + "\n", + "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c3f8447-842f-414c-b47f-869684c05c37", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create the numerators\n", + "\n", + "kr_dt[, (indicator_public_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_public_care_cols]\n", + "kr_dt[, (indicator_private_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_private_care_cols]\n", + "kr_dt[, (indicator_no_care) := as.integer(rowSums(.SD != 0, na.rm = TRUE) == 0), .SDcols = c(kid_public_care_cols, kid_private_care_cols)]\n", + "\n", + "# check\n", + "xtabs(~ kr_dt[[indicator_public_care]] + kr_dt[[indicator_private_care]] + kr_dt[[indicator_no_care]])" + ] + }, + { + "cell_type": "markdown", + "id": "ead4ca71-3a5c-445f-ae61-3359a7cc7e31", + "metadata": {}, + "source": [ + "### Sampling design" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81f99ad0-b0fa-4be8-b6df-9b726a0eccad", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", + "kr_design_sampling = svydesign(\n", + " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = kr_dt, # dataset\n", + " strata = ~ V023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? 
dunno what this is\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "88f9844d-3f29-4cc9-91e7-351bc81b6d69", + "metadata": {}, + "source": [ + "## Compute indicators" + ] + }, + { + "cell_type": "markdown", + "id": "0bec1af7-62e7-4da6-8531-1ed6f9933dc0", + "metadata": {}, + "source": [ + "Make the indicator tables and save them, add the sample estimation of the average proportion to a summary table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "357ecff3-538e-4265-b024-51f678bcf29f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "indicator_results <- compute_careseeking_indicators(\n", + " kr_design_sampling = kr_design_sampling,\n", + " indicator_names = c(indicator_public_care, indicator_private_care, indicator_no_care),\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")\n", + "\n", + "summary_table <- indicator_results$summary_table\n", + "pct_public_care_table <- indicator_results$indicator_tables[[indicator_public_care]]\n", + "pct_private_care_table <- indicator_results$indicator_tables[[indicator_private_care]]\n", + "pct_no_care_table <- indicator_results$indicator_tables[[indicator_no_care]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f40da01a-4558-43af-9d1a-136b7c6920db", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Summary table already prepared and renamed in compute_careseeking_indicators()\n", + "head(summary_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3896b18f-ad5f-49c2-a2cc-2ebf40208aca", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Summary output already 
exported in compute_careseeking_indicators()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2449bdb3-61cf-4ddd-a272-2172da6cba2e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } - }, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e0ebcf5-45d4-4c40-be95-cf27b0a3ae75", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "51c988f2-1fb5-4e35-8920-c5d341a543f3", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, 
- "id": "599d0f6e-d0f5-42fd-8e78-6a87bdbd12a6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "\n", - "indicator_public_care <- 'PCT_PUBLIC_CARE'\n", - "indicator_private_care <- 'PCT_PRIVATE_CARE'\n", - "indicator_no_care <- 'PCT_NO_CARE'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9589f2b2-c0ad-4a46-8778-faa4f427eb37", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "household_recode <- 'HR'\n", - "kid_recode <- 'KR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", - "\n", - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", - " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", - "dhs_kr_dt <- setDT(dhs_kr_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "c26e24f7-0a59-4dbf-b493-a760c9d30e39", - "metadata": {}, - "source": [ - "### Make admin codes and names dataframe (for future merging)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f244f017-b532-4106-8beb-4bf7c0ed4d02", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52af343a-bbee-4601-9e3f-adceb60d809f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rm(dhs_hr_dt) # will not be used further" - ] - }, - { - "cell_type": "markdown", - "id": "c35b457b-e6ac-42c6-bad5-4ee328e94177", - "metadata": {}, - "source": [ - "### Relevant columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ea43f49-f9dd-4403-a3b4-885ff31da8bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "kid_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", - "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", - "kid_alive_col <- \"B5\"\n", - "kid_age_col <- \"B19\"\n", - "kid_fever_col <- \"H22\"\n", - "# grep(\"^H32\", names(dhs_kr_dt), value = TRUE)\n", - "kid_public_care_cols <- c(\"H32A\", \"H32B\", \"H32C\", \"H32D\", \"H32E\", \"H32F\", \"H32G\", \"H32H\", \"H32I\")\n", - "kid_private_care_cols <- c(\"H32J\", \"H32K\", \"H32L\", \"H32M\", \"H32N\", \"H32O\", \"H32P\", \"H32Q\", \"H32R\",\n", - " \"H32NA\", \"H32NB\", \"H32NC\", \"H32ND\", \"H32NE\")\n", - "kid_other_care_cols <- c(\"H32S\", \"H32W\", \"H32X\")" - ] - }, - { - "cell_type": "markdown", - "id": "6b0924e4-aef6-4e1c-be29-4f818fb14d8e", - "metadata": {}, - "source": [ - "## Preprocess kid file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5184585f-bb68-4e99-89e2-ff52ffbf2eed", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# filter rows and columns\n", - "kr_dt <- dhs_kr_dt[(\n", - " !(is.na(get(kid_age_col))) & # no missing age\n", - " get(kid_age_col) < 60 & # younger than 5\n", - " get(kid_alive_col) == 1 & # alive\n", - " get(kid_fever_col) == 1 # had fever\n", - " ),\n", - " .SD, .SDcols = c(\n", - " kid_id_cols,\n", - " 
household_sampling_cols,\n", - " kid_alive_col,\n", - " kid_age_col,\n", - " kid_fever_col,\n", - " kid_other_care_cols,\n", - " kid_public_care_cols,\n", - " kid_private_care_cols\n", - " )]\n", - "\n", - "kr_dt[, wt := V005/1000000]\n", - "\n", - "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c3f8447-842f-414c-b47f-869684c05c37", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create the numerators\n", - "\n", - "kr_dt[, (indicator_public_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_public_care_cols]\n", - "kr_dt[, (indicator_private_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_private_care_cols]\n", - "kr_dt[, (indicator_no_care) := as.integer(rowSums(.SD != 0, na.rm = TRUE) == 0), .SDcols = c(kid_public_care_cols, kid_private_care_cols)]\n", - "\n", - "# check\n", - "xtabs(~ kr_dt[[indicator_public_care]] + kr_dt[[indicator_private_care]] + kr_dt[[indicator_no_care]])" - ] - }, - { - "cell_type": "markdown", - "id": "ead4ca71-3a5c-445f-ae61-3359a7cc7e31", - "metadata": {}, - "source": [ - "### Sampling design" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81f99ad0-b0fa-4be8-b6df-9b726a0eccad", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", - "kr_design_sampling = svydesign(\n", - " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = kr_dt, # dataset\n", - " strata = ~ V023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "88f9844d-3f29-4cc9-91e7-351bc81b6d69", - "metadata": {}, - "source": [ - "## Compute indicators" - ] - }, - { - "cell_type": "markdown", - "id": "0bec1af7-62e7-4da6-8531-1ed6f9933dc0", - "metadata": {}, - "source": [ - "Make the indicator tables and save them, add the sample estimation of the average proportion to a summary table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "357ecff3-538e-4265-b024-51f678bcf29f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "summary_table <- copy(admin_data)\n", - "\n", - "for (indicator_name in c(indicator_public_care, indicator_private_care, indicator_no_care)){\n", - " \n", - " # make the table name\n", - " table_name <- glue(tolower(indicator_name), 'table', .sep = '_')\n", - " \n", - " # create the content for the table\n", - " table_content <- svyby(\n", - " formula = as.formula(paste(\"~\", indicator_name)),\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = kr_design_sampling,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE # which observations have a substantial change in the results of the analysis\n", - " )\n", - " \n", - " # make it into data.table\n", - " setDT(table_content)\n", - "\n", - " lower_bound_col <- glue(\"{toupper(indicator_name)}_CI_LOWER_BOUND\")\n", - " upper_bound_col <- glue(\"{toupper(indicator_name)}_CI_UPPER_BOUND\")\n", - " sample_avg_col <- glue(\"{toupper(indicator_name)}_SAMPLE_AVERAGE\")\n", - " \n", - " # names(table_content) <- toupper(names(table_content))\n", - " names(table_content)[names(table_content) == 'ci_l'] <- lower_bound_col\n", - " names(table_content)[names(table_content) == 'ci_u'] <- upper_bound_col\n", - " names(table_content)[names(table_content) == indicator_name] <- 
sample_avg_col\n", - " \n", - " # cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - " table_content[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - " table_content[get(upper_bound_col) > 1, (upper_bound_col) := 1]\n", - "\n", - " # convert to percentages\n", - " table_content[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - " table_content[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - " table_content[, (sample_avg_col) := get(sample_avg_col) * 100]\n", - " \n", - " # add the sample average column to the summary table\n", - " indicator_estimation_table <- table_content[\n", - " ,\n", - " .SD,\n", - " .SDcols = c(\n", - " admin_name_col,\n", - " grep('SAMPLE_AVERAGE', names(table_content), value = TRUE)\n", - " )\n", - " ]\n", - " \n", - " # add the admin id column to the indicator output table\n", - " table_content <- merge.data.table(admin_data, table_content, by = admin_name_col)\n", - " \n", - " # add the admin id column to the summary point estimates table\n", - " summary_table <- merge.data.table(summary_table, indicator_estimation_table, by = admin_name_col)\n", - " \n", - " # write it to .csv and .parquet files\n", - " filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_name)}\")\n", - " write.csv(table_content, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - " write_parquet(table_content, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))\n", - " \n", - " # assign the content to its variable name\n", - " assign(table_name, table_content)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f40da01a-4558-43af-9d1a-136b7c6920db", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# changing names for the summary table\n", - "names(summary_table) <- gsub('_SAMPLE_AVERAGE', '', names(summary_table))" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "id": "3896b18f-ad5f-49c2-a2cc-2ebf40208aca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "summary_filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE\")\n", - "write.csv(summary_table, file = file.path(OUTPUT_DATA_PATH, paste0(summary_filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(summary_table, file.path(OUTPUT_DATA_PATH, paste0(summary_filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2449bdb3-61cf-4ddd-a272-2172da6cba2e", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb index ff19854..d7b950b 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb @@ -1,442 +1,483 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "1a337757-f2fa-467e-8241-ea174c7ea790", - "metadata": {}, - "source": [ - "# Under-5 Prevalence of Malaria (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "# Under-5 Prevalence of Malaria (DHS data)" + ], + "id": "1a337757-f2fa-467e-8241-ea174c7ea790" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources" + ], + "id": "fc27d9c1-0c0c-46df-9508-57add133acaf" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Prevalence_of_Malaria_in_Children.htm%23Percentage_of_children22bc-1&rhtocid=_15_13_0\n", + "\n", + "Numerators:\n", + "1) Number of de facto children tested using RDT who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 = 1)\n", + "2) Number of de facto children tested using microscopy who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 = 1)\n", + " \n", + "Denominators:\n", + "a) Number of de facto children tested using RDT (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 in 0,1)\n", + "b) Number of de facto children tested using microscopy (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 in 0,1,6)" + ], + "id": "9cf1724b-b01c-4b2e-93f7-d0f54cfa4850" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Project uses RDT**" + ], + "id": "7c61058a-361f-4992-9f2b-99a82f798fd8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "d0e715b0-9a8d-4d15-b7ef-486f16fec73b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)\n", + "\n", + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- 
file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n" + ], + "execution_count": null, + "outputs": [], + "id": "a3cc2680-f9cc-46cd-9565-2b8af14fc29a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get data" + ], + "id": "43135a7a-f1b8-4a89-9db1-9012c3369f3d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "7e83a6c1-cd47-4eeb-b949-7bef7d1f36c6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 
Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ], + "execution_count": null, + "outputs": [], + "id": "9a623578-170f-42e0-a012-78f0ddbcce87" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import " + ], + "id": "11aa09f3-ecc8-46c0-b9f3-95c0520202ef" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "indicator_u5prev <- 'PCT_U5_PREV_RDT' # to be computed\n", + "\n", + "data_source <- 'DHS'\n", + "household_recode <- 'HR'\n", + "person_recode <- 'PR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", + "\n", + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_pr_zip_filename <- 
extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", + " stop(\"The input DHS data do not have the same version/issue. Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", + "dhs_pr_dt <- setDT(dhs_pr_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "c015490b-a8fc-471d-a83f-0dce8e010cef" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make admin dataframe (for future merging)" + ], + "id": "8b191b87-3694-4883-99c9-66ade3477f8e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", 
+ "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.\")\n", + "}\n", + "\n", + "rm(dhs_hr_dt) # free up resources" + ], + "execution_count": null, + "outputs": [], + "id": "e0767159-1660-48c3-bdaf-f8be3643e039" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess" + ], + "id": "8a47d5f6-f8d8-4373-9cc8-a776d85b8a75" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Relevant columns\n", + "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", + "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", + "hemoglobin_selection_col = \"HV042\"\n", + "person_slept_col <- grep(\"^HV103\", names(dhs_pr_dt), value = TRUE)\n", + "kid_age_col <- \"HC1\"\n", + "smear_result_col <- \"HML32\" # smear test (GE)\n", + "rdt_result_col <- \"HML35\" # rapid diagnostic test (RDT / TDR)" + ], + "execution_count": null, + "outputs": [], + "id": "f49ecb55-da48-4cf5-aa2b-7691eed04b77" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# sapply(kid_age_cols, function(i) table(dhs_pr_dt[[i]], useNA = 'always'))\n", + "\n", + "# table(dhs_pr_dt$HC1, useNA = 'ifany')\n", + "# table(dhs_pr_dt$HV103, useNA = 'ifany')\n", + "# table(dhs_pr_dt$HV042, useNA = 'ifany')\n", + "\n", + "# filter rows and columns\n", + "pr_dt <- dhs_pr_dt[(\n", + " !(is.na(get(kid_age_col))) & # no missing age\n", + " get(kid_age_col) >= 6 & # 6 months or older\n", + " get(kid_age_col) <= 59 & # 
younger than 5\n", + " get(person_slept_col) == 1 & # slept last night in household\n", + " get(hemoglobin_selection_col) == 1 # household selected for hemoglobin test\n", + " ),\n", + " .SD, .SDcols = c(\n", + " household_id_cols,\n", + " household_sampling_cols,\n", + " hemoglobin_selection_col,\n", + " person_slept_col,\n", + " kid_age_col,\n", + " smear_result_col,\n", + " rdt_result_col)\n", + " ]\n", + "\n", + "pr_dt[, wt := HV005/1000000]\n", + "\n", + "pr_dt <- merge.data.table(dhs_admin_dt, pr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "a5fe638a-7418-4049-822e-6c3d715fdded" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rapid Diagnostic Test Indicator" + ], + "id": "7fa09c3c-d462-42f0-830a-444c79addf86" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "xtabs( ~ get(rdt_result_col), data = pr_dt, addNA = TRUE)\n", + "\n", + "# filter rows\n", + "rdt_dt <- pr_dt[\n", + " get(rdt_result_col) %in% c(0, 1), # tested and had either positive (1) or negative (0) result\n", + " ]\n", + "\n", + "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", + "rdt_design_sampling = svydesign(\n", + " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = rdt_dt, # dataset\n", + " strata = ~ HV023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? 
dunno what this is\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "af0796b0-6f8e-40a0-8a41-e837ce0b02ee" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", + "malaria_rdt_table <- compute_and_export_indicator_table(\n", + " design_obj = rdt_design_sampling,\n", + " indicator_name = rdt_result_col,\n", + " output_indicator_name = indicator_u5prev,\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " filename_without_extension = filename_without_extension\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "028130f5-8522-4a2f-81d9-25630affdab3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# prevalence table computed and exported in previous cell." 
+ ], + "execution_count": null, + "outputs": [], + "id": "bcef2f47-d0d8-4cea-8477-a192b92bb9a8" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "e975d45b-1e32-442c-a859-d65ad3db904c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "706e8de0-643f-4b68-a9e7-8c9dcd7bfba7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# already merged and exported above" + ], + "execution_count": null, + "outputs": [], + "id": "be99a596-2647-4399-979f-4fd5855bd7cf" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "d8d7236b-bf86-4da4-bc15-ca97b57b5ba4" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "fc27d9c1-0c0c-46df-9508-57add133acaf", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "9cf1724b-b01c-4b2e-93f7-d0f54cfa4850", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Prevalence_of_Malaria_in_Children.htm%23Percentage_of_children22bc-1&rhtocid=_15_13_0\n", - "\n", - "Numerators:\n", - "1) Number of de facto children tested using RDT who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 = 1)\n", - "2) Number of de facto children tested using microscopy who are positive for malaria (hv042 = 1 & hv103 = 1 
& hc1 in 6:59 & hml32 = 1)\n", - " \n", - "Denominators:\n", - "a) Number of de facto children tested using RDT (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 in 0,1)\n", - "b) Number of de facto children tested using microscopy (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 in 0,1,6)" - ] - }, - { - "cell_type": "markdown", - "id": "7c61058a-361f-4992-9f2b-99a82f798fd8", - "metadata": {}, - "source": [ - "**Project uses RDT**" - ] - }, - { - "cell_type": "markdown", - "id": "d0e715b0-9a8d-4d15-b7ef-486f16fec73b", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a3cc2680-f9cc-46cd-9565-2b8af14fc29a", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)\n", - "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- 
paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n" - ] - }, - { - "cell_type": "markdown", - "id": "43135a7a-f1b8-4a89-9db1-9012c3369f3d", - "metadata": {}, - "source": [ - "## Get data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e83a6c1-cd47-4eeb-b949-7bef7d1f36c6", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a623578-170f-42e0-a012-78f0ddbcce87", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- 
clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "11aa09f3-ecc8-46c0-b9f3-95c0520202ef", - "metadata": {}, - "source": [ - "### Import " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c015490b-a8fc-471d-a83f-0dce8e010cef", - "metadata": {}, - "outputs": [], - "source": [ - "indicator_u5prev <- 'PCT_U5_PREV_RDT' # to be computed\n", - "\n", - "data_source <- 'DHS'\n", - "household_recode <- 'HR'\n", - "person_recode <- 'PR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", - "\n", - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", - " stop(\"The input DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", - "dhs_pr_dt <- setDT(dhs_pr_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "8b191b87-3694-4883-99c9-66ade3477f8e", - "metadata": {}, - "source": [ - "### Make admin dataframe (for future merging)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0767159-1660-48c3-bdaf-f8be3643e039", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}\n", - "\n", - "rm(dhs_hr_dt) # free up resources" - ] - }, - { - "cell_type": "markdown", - "id": "8a47d5f6-f8d8-4373-9cc8-a776d85b8a75", - "metadata": {}, - "source": [ - "### Preprocess" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f49ecb55-da48-4cf5-aa2b-7691eed04b77", - "metadata": {}, - "outputs": [], - "source": [ - "# Relevant columns\n", - "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", - "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", - "hemoglobin_selection_col = \"HV042\"\n", - "person_slept_col <- grep(\"^HV103\", names(dhs_pr_dt), value = TRUE)\n", - "kid_age_col <- \"HC1\"\n", - "smear_result_col <- \"HML32\" # smear test (GE)\n", - "rdt_result_col <- \"HML35\" # rapid diagnostic test (RDT / TDR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5fe638a-7418-4049-822e-6c3d715fdded", - "metadata": {}, - "outputs": [], - "source": [ - "# sapply(kid_age_cols, function(i) table(dhs_pr_dt[[i]], useNA = 'always'))\n", - "\n", - "# table(dhs_pr_dt$HC1, useNA = 'ifany')\n", - "# table(dhs_pr_dt$HV103, useNA = 'ifany')\n", - "# table(dhs_pr_dt$HV042, useNA = 'ifany')\n", - "\n", - "# filter rows and columns\n", - "pr_dt <- dhs_pr_dt[(\n", - " !(is.na(get(kid_age_col))) & # no missing age\n", - " get(kid_age_col) >= 6 & # 6 months or older\n", - " get(kid_age_col) <= 59 & # younger than 5\n", - " get(person_slept_col) == 1 & # slept last night in household\n", - " get(hemoglobin_selection_col) == 1 # household selected for hemoglobin test\n", - " ),\n", - " .SD, .SDcols = c(\n", - " household_id_cols,\n", - " household_sampling_cols,\n", - " hemoglobin_selection_col,\n", - " person_slept_col,\n", - " kid_age_col,\n", - " smear_result_col,\n", - " rdt_result_col)\n", - " ]\n", - "\n", - "pr_dt[, wt := HV005/1000000]\n", - "\n", - "pr_dt <- merge.data.table(dhs_admin_dt, pr_dt, by.x = 
\"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)" - ] - }, - { - "cell_type": "markdown", - "id": "7fa09c3c-d462-42f0-830a-444c79addf86", - "metadata": {}, - "source": [ - "## Rapid Diagnostic Test Indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af0796b0-6f8e-40a0-8a41-e837ce0b02ee", - "metadata": {}, - "outputs": [], - "source": [ - "xtabs( ~ get(rdt_result_col), data = pr_dt, addNA = TRUE)\n", - "\n", - "# filter rows\n", - "rdt_dt <- pr_dt[\n", - " get(rdt_result_col) %in% c(0, 1), # tested and had either positive (1) or negative (0) result\n", - " ]\n", - "\n", - "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", - "rdt_design_sampling = svydesign(\n", - " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = rdt_dt, # dataset\n", - " strata = ~ HV023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "028130f5-8522-4a2f-81d9-25630affdab3", - "metadata": {}, - "outputs": [], - "source": [ - "malaria_rdt_table <- svyby(formula = as.formula(paste(\"~\", rdt_result_col)), # to dynamically set the target colname\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col), # to dynamically define the grouping colname\n", - " FUN = svymean, # compute survey mean\n", - " design = rdt_design_sampling, # the weights, strata, clusters\n", - " level = 0.95, # the level for CI's\n", - " vartype = \"ci\", # for variance, use the CI's\n", - " na.rm = TRUE, # remove the NA's in the calculation\n", - " influence = TRUE) # which observations have a substantial change in the results of the analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bcef2f47-d0d8-4cea-8477-a192b92bb9a8", - "metadata": {}, - "outputs": [], - "source": [ - "setDT(malaria_rdt_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e975d45b-1e32-442c-a859-d65ad3db904c", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_u5prev)}_SAMPLE_AVERAGE\")\n", - "\n", - "# names(malaria_rdt_table) <- toupper(names(malaria_rdt_table))\n", - "names(malaria_rdt_table)[names(malaria_rdt_table) == 'ci_l'] <- lower_bound_col\n", - "names(malaria_rdt_table)[names(malaria_rdt_table) == 'ci_u'] <- upper_bound_col\n", - "names(malaria_rdt_table)[names(malaria_rdt_table) == rdt_result_col] <- sample_avg_col\n", - "\n", - "# Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - "malaria_rdt_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - 
"malaria_rdt_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "706e8de0-643f-4b68-a9e7-8c9dcd7bfba7", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to percentages\n", - "malaria_rdt_table[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - "malaria_rdt_table[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - "malaria_rdt_table[, (sample_avg_col) := get(sample_avg_col) * 100]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be99a596-2647-4399-979f-4fd5855bd7cf", - "metadata": {}, - "outputs": [], - "source": [ - "malaria_rdt_table <- merge.data.table(admin_data, malaria_rdt_table, by = admin_name_col, all = TRUE)\n", - "\n", - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", - "write.csv(malaria_rdt_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(malaria_rdt_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8d7236b-bf86-4da4-bc15-ca97b57b5ba4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb index 8813975..4e8d453 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb +++ 
b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb @@ -1,604 +1,649 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "4a75d418-3144-427f-8fa3-2cb7f727e66d", - "metadata": {}, - "source": [ - "# DTP Vaccination rates and attrition using DHS data" - ] - }, - { - "cell_type": "markdown", - "id": "6b666c7a-f105-4fad-aea0-d23a38fa0153", - "metadata": {}, - "source": [ - "## Preliminaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dbcf1df6-c264-48b3-bf53-f13a4b036487", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "449ea786-ac70-4513-aaf5-24ffe69aec5c", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9037624-3ac0-403c-9ea0-fc78891c2393", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f13522e-a0e5-44b7-b8f5-b66dc7ca37c0", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"readr\", \"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"httr\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "e32d9589-1a34-4a62-ac93-ae04e5939eb1", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d0221bb-9d59-42ff-a6c9-09c735083135", - "metadata": {}, - "outputs": [], - "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9196629-0d55-416c-9171-4ebc630cc93b", - "metadata": {}, - "outputs": [], - "source": [ - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "ff080767-9fdb-4790-988c-e2b4c4f7226f", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5249483e-2b07-425d-b385-bc32ac601ced", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1308ee0c-4856-445f-b518-ee0f4497c9b5", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - 
"# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - " )\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# DRC provinces need to be cleaned\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "ba6c6b6b-e0fd-434a-8e44-6c45bea47d97", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29d4866a-5f98-404e-aaa8-6fe1e6dc9e2d", - "metadata": {}, - "outputs": [], - "source": [ - "vaccination_doses <- c(1, 2, 3)\n", - "indicator_access <- 'PCT_DTP'\n", - "indicator_attrition <- 'PCT_DROPOUT_DTP'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6bd9ec2-cdcb-41d9-8cd7-54096bbfa2a3", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "household_recode <- 'HR'\n", - "kid_recode <- 'KR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5591f0f-fe5f-4c05-9368-2d1c7e388782", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - 
"unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('DTP', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", - " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", - "dhs_kr_dt <- setDT(dhs_kr_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "51ccd171-5056-42bc-a45f-ea70bf2c3bbe", - "metadata": {}, - "source": [ - "## Preprocess DHS data" - ] - }, - { - "cell_type": "markdown", - "id": "d6607265-c852-48b0-852b-e8fecaed804b", - "metadata": {}, - "source": [ - "### Extract DHS admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fb209c6-8090-40f0-a101-fbfbeb17de41", - "metadata": {}, - "outputs": [], - "source": [ - "# Make admin codes and names dataframe (for future merging)\n", - "\n", - "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}\n", - "\n", - "rm(dhs_hr_dt) # free up resources" - ] - }, - { - "cell_type": "markdown", - "id": "ae09cc05-a942-4e56-be52-7cc5876f62a9", - "metadata": {}, - "source": [ - "### Filter rows and columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b852bc5-fc91-45d6-a668-0efb25b00092", - "metadata": {}, - "outputs": [], - "source": [ - "# remove dead children from the dataset, keep only children aged 1 or more (avoid left censoring for vaccination) and respect the base for the 'h' variables\n", - "kr_dt <- dhs_kr_dt[B5 == 1 & B8 >= 1 & B19 < 36,]\n", - "\n", - "household_id_cols <- c('V000', 'V001', 'V002')\n", - "kid_id_cols <- c('CASEID', 'BIDX')\n", - "kid_dpt1_cols <- c('H3', 'H3D', 'H3M', 'H3Y')\n", - "kid_dpt2_cols <- c('H5', 'H5D', 'H5M', 'H5Y')\n", - "kid_dpt3_cols <- c('H7', 'H7D', 'H7M', 'H7Y')\n", - "kid_sampling_cols <- c('V005', 'V021', 'V023', 'V024')\n", - "\n", - "kr_dt <- kr_dt[, .SD, .SDcols = c(household_id_cols, kid_id_cols, kid_sampling_cols, kid_dpt1_cols, kid_dpt2_cols, kid_dpt3_cols)]\n", - "\n", - "# # check i didn't omit any crucial variable\n", - "# stopifnot(nrow(kr_dt[duplicated(kr_dt)]) == 0)" - ] - }, - { - "cell_type": "markdown", - "id": "a0194b86-ac85-44b5-bdc0-d7dd46b2170a", - "metadata": {}, - "source": [ - "### New features" - ] - }, - { - "cell_type": "markdown", - "id": "fb4ad225-93bc-44f7-bc87-d16f9eb2f065", - "metadata": {}, - "source": [ - "Add the region labels, to subsequently match DHIS2 data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e63a49a0-273a-4cef-ac91-2f95acf141dd", - "metadata": {}, - "outputs": [], - "source": [ - "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" - ] - }, - { - "cell_type": "markdown", - "id": "f578eda9-0ae6-4366-9aa3-a3a110e19d30", - "metadata": {}, - "source": [ - "Create the target features (whether or not the kid was vaccinated, 
for each dose)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e6d586f-636d-4b5c-9ccb-31520e4ca62e", - "metadata": {}, - "outputs": [], - "source": [ - "# Create dummy variables for the various DTP vaccine doses\n", - "kr_dt[, `:=`(\n", - " DTP1 = fcase(\n", - " H3 == 0L, 0L,\n", - " H3 %in% c(1L, 2L, 3L), 1L,\n", - " default = NA\n", - " ),\n", - " DTP2 = fcase(\n", - " H5 == 0L, 0L,\n", - " H5 %in% c(1L, 2L, 3L), 1L,\n", - " default = NA\n", - " ),\n", - " DTP3 = fcase(\n", - " H7 == 0L, 0L,\n", - " H7 %in% c(1L, 2L, 3L), 1L,\n", - " default = NA\n", - " )\n", - ")]\n", - "\n", - "# Correct external consistency issues: children who got the third dose also had the second, and so on:\n", - "kr_dt[DTP2 == 1, DTP1 := 1]\n", - "kr_dt[DTP3 == 1, DTP1 := 1]\n", - "kr_dt[DTP3 == 1, DTP2 := 1]" - ] - }, - { - "cell_type": "markdown", - "id": "f11d02d8-1c0f-4cc0-9bc6-736a84d73a23", - "metadata": {}, - "source": [ - "### Create the survey design" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dad05a65-9967-4c4f-9522-ba6cd4cfef7f", - "metadata": {}, - "outputs": [], - "source": [ - "# compute the household/kid weights\n", - "kr_dt[, wt := V005/1000000]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db28fe66-7e99-4d98-8ac1-0ea7ba30c677", - "metadata": {}, - "outputs": [], - "source": [ - "# account for the sampling strategy (clustering, stratification, weights) for means, proportions, regression models, etc.\n", - "dtp_design = svydesign(\n", - " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = kr_dt, # dataset\n", - " strata = ~ V023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " nest = T # the primary sampling units are nested within the strata\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "44ece144-6678-4de2-a1d4-f924b6549345", - "metadata": {}, - "source": [ - "## 
Vaccination proportion indicator" - ] - }, - { - "cell_type": "markdown", - "id": "a2034bea-e0fd-4268-b383-4d39d9cd7e75", - "metadata": {}, - "source": [ - "For each vaccine dose:\n", - "- compute the proportions of vaccinated per region\n", - "- compute the CIs\n", - "- add the admin units and save to .csv and parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1c19f83-a1e1-4b1d-bc26-605e4ed4fa06", - "metadata": {}, - "outputs": [], - "source": [ - "# create the empty dropout table to add each proportion and compute attrition rates\n", - "DTP_DROPOUT <- copy(admin_data)\n", - "\n", - "for (dose_number in vaccination_doses){\n", - " table_name <- glue(\"{toupper(indicator_access)}{dose_number}\")\n", - " vaccine_colname <- glue(\"DTP{dose_number}\")\n", - " \n", - " computed_proportions <- svyby(\n", - " # formula = ~ get(vaccine_colname),\n", - " as.formula(paste(\"~\", vaccine_colname)),\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = dtp_design,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE\n", - " )\n", - " \n", - " # change the name of the target column (to avoid the 'get' in the name)\n", - " names(computed_proportions)[2] <- vaccine_colname\n", - " \n", - " # assign the table value to the table name\n", - " assign(table_name, computed_proportions)\n", - " \n", - " # change the names of the columns\n", - " lower_bound_col <- glue(\"{toupper(indicator_access)}{dose_number}_CI_LOWER_BOUND\")\n", - " upper_bound_col <- glue(\"{toupper(indicator_access)}{dose_number}_CI_UPPER_BOUND\")\n", - " sample_avg_col <- glue(\"{toupper(indicator_access)}{dose_number}_SAMPLE_AVERAGE\") \n", - " \n", - " # retrieve data, modify colnames, and reassign\n", - " df <- get(table_name)\n", - " names(df)[names(df) == 'ci_l'] <- lower_bound_col\n", - " names(df)[names(df) == 'ci_u'] <- upper_bound_col\n", - " names(df)[names(df) == vaccine_colname] 
<- sample_avg_col\n", - " setDT(df)\n", - "\n", - " # Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - " df[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - " df[get(upper_bound_col) > 1, (upper_bound_col) := 1]\n", - "\n", - " # Convert to percentages\n", - " df[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - " df[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - " df[, (sample_avg_col) := get(sample_avg_col) * 100]\n", - " \n", - " # add the admin units\n", - " df <- merge.data.table(admin_data, df, by = admin_name_col, all.x = TRUE)\n", - " \n", - " # write to file\n", - " filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{table_name}\")\n", - " fwrite(df, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", - " write_parquet(df, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))\n", - " \n", - " # add current dose table to the summary table (for future computation of dropout rates)\n", - " DTP_DROPOUT <- merge.data.table(DTP_DROPOUT, df, by = admin_cols)\n", - "}\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "d4e56da8-3ed5-4791-913c-58b76fbba125", - "metadata": {}, - "source": [ - "## Dropout rate indicator" - ] - }, - { - "cell_type": "markdown", - "id": "375952b4-4a25-435c-aae1-c53c54e9382c", - "metadata": {}, - "source": [ - "Add dropout rates plots: for each vaccine dose:\n", - "- make the dropout rates\n", - "- add them to the summary file and save it as .csv and parquet\n", - "- make plots and save them" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d21baeab-f1b9-441d-b073-e7dc193a7535", - "metadata": {}, - "outputs": [], - "source": [ - "# remove the CI columns (as requested)\n", - "DTP_DROPOUT[, grep(\"BOUND\", names(DTP_DROPOUT), value = TRUE) := NULL]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed9b8475-bc2e-4170-b5b5-034909840269", - "metadata": {}, - 
"outputs": [], - "source": [ - "for(current_dose in vaccination_doses){\n", - " for (reference_dose in 1:(current_dose - 1)){\n", - " if((reference_dose >= 1) & (reference_dose < current_dose)){\n", - " attrition_col <- glue(\"{toupper(indicator_attrition)}_{reference_dose}_{current_dose}\")\n", - " print(glue('Computing attrition for {attrition_col}'))\n", - " numerator_colname <- glue(\"{toupper(indicator_access)}{current_dose}_SAMPLE_AVERAGE\")\n", - " denominator_colname <- glue(\"{toupper(indicator_access)}{reference_dose}_SAMPLE_AVERAGE\")\n", - " DTP_DROPOUT[, (attrition_col) := (1 - get(numerator_colname) / get(denominator_colname))*100] # percentages instead of rates, as requested\n", - " }\n", - " }\n", - "}\n", - "\n", - "# remove the unnecessary columns\n", - "DTP_DROPOUT[, grep(\"SAMPLE_AVERAGE\", names(DTP_DROPOUT), value = TRUE) := NULL]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2369423c-8752-4c92-ad84-44d2120be6a9", - "metadata": {}, - "outputs": [], - "source": [ - "dtp_dropout_filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_attrition}\")\n", - "fwrite(DTP_DROPOUT, file = file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, \".csv\")))\n", - "write_parquet(DTP_DROPOUT, file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, \".parquet\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecc63839-393c-4487-bdfb-e9e96626cc71", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "4a75d418-3144-427f-8fa3-2cb7f727e66d", + "metadata": {}, + "source": [ + 
"# DTP Vaccination rates and attrition using DHS data" + ] + }, + { + "cell_type": "markdown", + "id": "6b666c7a-f105-4fad-aea0-d23a38fa0153", + "metadata": {}, + "source": [ + "## Preliminaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbcf1df6-c264-48b3-bf53-f13a4b036487", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "449ea786-ac70-4513-aaf5-24ffe69aec5c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9037624-3ac0-403c-9ea0-fc78891c2393", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f13522e-a0e5-44b7-b8f5-b66dc7ca37c0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"readr\", \"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"httr\", \"arrow\")\n", + "\n", + "# 
Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32d9589-1a34-4a62-ac93-ae04e5939eb1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0221bb-9d59-42ff-a6c9-09c735083135", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9196629-0d55-416c-9171-4ebc630cc93b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ] + }, + { + "cell_type": "markdown", + "id": "ff080767-9fdb-4790-988c-e2b4c4f7226f", + "metadata": {}, + "source": [ + "## Geo data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5249483e-2b07-425d-b385-bc32ac601ced", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1308ee0c-4856-445f-b518-ee0f4497c9b5", + "metadata": 
{ + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + " )\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# DRC provinces need to be cleaned\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ] + }, + { + "cell_type": "markdown", + "id": "ba6c6b6b-e0fd-434a-8e44-6c45bea47d97", + "metadata": {}, + "source": [ + "## Import DHS data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29d4866a-5f98-404e-aaa8-6fe1e6dc9e2d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "vaccination_doses <- c(1, 2, 3)\n", + "indicator_access <- 'PCT_DTP'\n", + "indicator_attrition <- 'PCT_DROPOUT_DTP'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6bd9ec2-cdcb-41d9-8cd7-54096bbfa2a3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_source <- 'DHS'\n", + "household_recode <- 'HR'\n", + "kid_recode <- 'KR'\n", + 
"target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5591f0f-fe5f-4c05-9368-2d1c7e388782", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('DTP', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", + " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", + "dhs_kr_dt <- setDT(dhs_kr_dt)" + ] + }, + { + "cell_type": "markdown", + "id": "51ccd171-5056-42bc-a45f-ea70bf2c3bbe", + "metadata": {}, + "source": [ + "## Preprocess DHS data" + ] + }, + { + "cell_type": "markdown", + "id": "d6607265-c852-48b0-852b-e8fecaed804b", + "metadata": {}, + "source": [ + "### Extract DHS admin data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fb209c6-8090-40f0-a101-fbfbeb17de41", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Make admin codes and names dataframe (for future merging)\n", + "\n", + "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}\n", + "\n", + "rm(dhs_hr_dt) # free up resources" + ] + }, + { + "cell_type": "markdown", + "id": "ae09cc05-a942-4e56-be52-7cc5876f62a9", + "metadata": {}, + "source": [ + "### Filter rows and columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b852bc5-fc91-45d6-a668-0efb25b00092", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# remove dead children from the dataset, keep only children aged 1 or more (avoid left censoring for vaccination) and respect the base for the 'h' variables\n", + "kr_dt <- dhs_kr_dt[B5 == 1 & B8 >= 1 & B19 < 36,]\n", + "\n", + "household_id_cols <- c('V000', 'V001', 'V002')\n", + "kid_id_cols <- c('CASEID', 'BIDX')\n", + "kid_dpt1_cols <- c('H3', 'H3D', 'H3M', 'H3Y')\n", + "kid_dpt2_cols <- c('H5', 'H5D', 'H5M', 'H5Y')\n", + "kid_dpt3_cols <- c('H7', 'H7D', 'H7M', 'H7Y')\n", + "kid_sampling_cols <- c('V005', 'V021', 'V023', 'V024')\n", + "\n", + "kr_dt <- kr_dt[, .SD, .SDcols = c(household_id_cols, kid_id_cols, kid_sampling_cols, kid_dpt1_cols, kid_dpt2_cols, kid_dpt3_cols)]\n", + "\n", + "# # check i didn't omit any crucial variable\n", + "# stopifnot(nrow(kr_dt[duplicated(kr_dt)]) == 0)" + ] + }, + { + "cell_type": "markdown", + "id": "a0194b86-ac85-44b5-bdc0-d7dd46b2170a", + "metadata": {}, + "source": [ + "### New features" + ] + }, + { + "cell_type": "markdown", + "id": "fb4ad225-93bc-44f7-bc87-d16f9eb2f065", + "metadata": {}, + "source": [ + "Add the region labels, to subsequently match DHIS2 data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e63a49a0-273a-4cef-ac91-2f95acf141dd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" + ] + }, + { + "cell_type": "markdown", + "id": "f578eda9-0ae6-4366-9aa3-a3a110e19d30", + "metadata": {}, + 
"source": [ + "Create the target features (whether or not the kid was vaccinated, for each dose)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e6d586f-636d-4b5c-9ccb-31520e4ca62e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create dummy variables for the various DTP vaccine doses\n", + "kr_dt[, `:=`(\n", + " DTP1 = fcase(\n", + " H3 == 0L, 0L,\n", + " H3 %in% c(1L, 2L, 3L), 1L,\n", + " default = NA\n", + " ),\n", + " DTP2 = fcase(\n", + " H5 == 0L, 0L,\n", + " H5 %in% c(1L, 2L, 3L), 1L,\n", + " default = NA\n", + " ),\n", + " DTP3 = fcase(\n", + " H7 == 0L, 0L,\n", + " H7 %in% c(1L, 2L, 3L), 1L,\n", + " default = NA\n", + " )\n", + ")]\n", + "\n", + "# Correct external consistency issues: children who got the third dose also had the second, and so on:\n", + "kr_dt[DTP2 == 1, DTP1 := 1]\n", + "kr_dt[DTP3 == 1, DTP1 := 1]\n", + "kr_dt[DTP3 == 1, DTP2 := 1]" + ] + }, + { + "cell_type": "markdown", + "id": "f11d02d8-1c0f-4cc0-9bc6-736a84d73a23", + "metadata": {}, + "source": [ + "### Create the survey design" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dad05a65-9967-4c4f-9522-ba6cd4cfef7f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# compute the household/kid weights\n", + "kr_dt[, wt := V005/1000000]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db28fe66-7e99-4d98-8ac1-0ea7ba30c677", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# account for the sampling strategy (clustering, stratification, weights) for means, proportions, regression models, etc.\n", + "dtp_design = svydesign(\n", + " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = kr_dt, # dataset\n", + " strata = ~ V023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " 
nest = T # the primary sampling units are nested within the strata\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "44ece144-6678-4de2-a1d4-f924b6549345", + "metadata": {}, + "source": [ + "## Vaccination proportion indicator" + ] + }, + { + "cell_type": "markdown", + "id": "a2034bea-e0fd-4268-b383-4d39d9cd7e75", + "metadata": {}, + "source": [ + "For each vaccine dose:\n", + "- compute the proportions of vaccinated per region\n", + "- compute the CIs\n", + "- add the admin units and save to .csv and parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1c19f83-a1e1-4b1d-bc26-605e4ed4fa06", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "vaccination_results <- compute_dtp_indicator_tables(\n", + " dtp_design = dtp_design,\n", + " vaccination_doses = vaccination_doses,\n", + " indicator_access = indicator_access,\n", + " admin_name_col = admin_name_col,\n", + " admin_cols = admin_cols,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")\n", + "\n", + "DTP_DROPOUT <- vaccination_results$dtp_dropout\n", + "PCT_DTP1 <- vaccination_results$dose_tables[[\"PCT_DTP1\"]]\n", + "PCT_DTP2 <- vaccination_results$dose_tables[[\"PCT_DTP2\"]]\n", + "PCT_DTP3 <- vaccination_results$dose_tables[[\"PCT_DTP3\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "d4e56da8-3ed5-4791-913c-58b76fbba125", + "metadata": {}, + "source": [ + "## Dropout rate indicator" + ] + }, + { + "cell_type": "markdown", + "id": "375952b4-4a25-435c-aae1-c53c54e9382c", + "metadata": {}, + "source": [ + "Add dropout rates plots: for each vaccine dose:\n", + "- make the dropout rates\n", + "- add them to the summary file and save it as .csv and parquet\n", + "- make plots and save them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"d21baeab-f1b9-441d-b073-e7dc193a7535", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# dropout computed and exported in next cell using helper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed9b8475-bc2e-4170-b5b5-034909840269", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "DTP_DROPOUT <- compute_and_export_dtp_dropout(\n", + " dtp_dropout = DTP_DROPOUT,\n", + " vaccination_doses = vaccination_doses,\n", + " indicator_access = indicator_access,\n", + " indicator_attrition = indicator_attrition,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2369423c-8752-4c92-ad84-44d2120be6a9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# already exported by compute_and_export_dtp_dropout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecc63839-393c-4487-bdfb-e9e96626cc71", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb b/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb index 55f03b1..4c0c8fb 100644 --- a/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb +++ b/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb @@ -1,315 +1,323 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": 
"33d7a9b4-8e3f-4ff5-8369-88dccd3f6d8c", - "metadata": {}, - "source": [ - "# Plots for careseeking behavior upon child fever (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "33d7a9b4-8e3f-4ff5-8369-88dccd3f6d8c", + "metadata": {}, + "source": [ + "# Plots for careseeking behavior upon child fever (DHS data)" + ] + }, + { + "cell_type": "markdown", + "id": "0ef48ace-d77e-49bf-9b21-1cece3d48161", + "metadata": {}, + "source": [ + "## Preliminary steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4156fe8-631a-4012-8c66-08dc8a721851", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20a367b8-844b-41a7-8725-4bd37bda0352", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", + "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d81fb691-698d-4832-9137-f8c6f0c5938c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", + "\n", + "# List required 
pcks\n", + "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2ab76ca-5e05-4867-af1f-ade9d540a1b9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ] + }, + { + "cell_type": "markdown", + "id": "484aed3b-0c40-40f4-8f81-0c4ed16a5d49", + "metadata": {}, + "source": [ + "## Geo data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d326034-33ff-40d6-a860-81d0bd5a1c34", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0b405fa-f9aa-40b4-b266-54d1de4b2317", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "260e3c90-a72c-4683-acc9-d3ed3d7ac516", + "metadata": {}, + "source": [ + "## DHS tables/names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b95b6df7-2781-4ec5-9c5b-fe79227268de", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_source <- 'DHS'\n", + "# indicator_public_care <- 'PUBLIC_CARE'\n", + "# indicator_private_care <- 'PRIVATE_CARE'\n", + "# indicator_no_care <- 'NO_CARE'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d5b1eae-4933-45e1-9cb8-981163cd1369", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE\")\n", + "careseeking_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", + "\n", + "# all columns which are not admin columns, are indicator columns\n", + "all_indicators <- 
setdiff(names(careseeking_table), admin_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "530e1567-2820-4a91-bd97-68af0df7aa4c", + "metadata": {}, + "source": [ + "## Maps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0120c63f-27ab-421b-b635-1e538295c466", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "plot_data = merge(spatial_data, careseeking_table, by = admin_cols, all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "071dd126-b29d-4f67-b1a2-89b5c09a43e7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "export_careseeking_reporting_plots(\n", + " plot_data = plot_data,\n", + " all_indicators = all_indicators,\n", + " output_plots_path = OUTPUT_PLOTS_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f42cc870-2d67-450d-9fd7-616cfb099eb2", + "metadata": {}, + "source": [ + "## Confidence interval plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff3157da-2816-40cc-9d73-f53849947fe9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "export_careseeking_reporting_ci_plots(\n", + " all_indicators = all_indicators,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " output_plots_path = OUTPUT_PLOTS_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level,\n", + " admin_name_col = admin_name_col\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eefeb57-1a36-45a9-a462-a4fe098e3bda", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": 
".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "0ef48ace-d77e-49bf-9b21-1cece3d48161", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4156fe8-631a-4012-8c66-08dc8a721851", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20a367b8-844b-41a7-8725-4bd37bda0352", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", - "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d81fb691-698d-4832-9137-f8c6f0c5938c", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2ab76ca-5e05-4867-af1f-ade9d540a1b9", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - 
"\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": "484aed3b-0c40-40f4-8f81-0c4ed16a5d49", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d326034-33ff-40d6-a860-81d0bd5a1c34", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0b405fa-f9aa-40b4-b266-54d1de4b2317", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " 
admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "260e3c90-a72c-4683-acc9-d3ed3d7ac516", - "metadata": {}, - "source": [ - "## DHS tables/names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b95b6df7-2781-4ec5-9c5b-fe79227268de", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "# indicator_public_care <- 'PUBLIC_CARE'\n", - "# indicator_private_care <- 'PRIVATE_CARE'\n", - "# indicator_no_care <- 'NO_CARE'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d5b1eae-4933-45e1-9cb8-981163cd1369", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE\")\n", - "careseeking_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", - "\n", - "# all columns which are not admin columns, are indicator columns\n", - "all_indicators <- setdiff(names(careseeking_table), admin_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "530e1567-2820-4a91-bd97-68af0df7aa4c", - "metadata": {}, - "source": [ - "## Maps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0120c63f-27ab-421b-b635-1e538295c466", - "metadata": {}, - "outputs": [], - "source": [ - "plot_data = merge(spatial_data, careseeking_table, by = admin_cols, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "071dd126-b29d-4f67-b1a2-89b5c09a43e7", - "metadata": {}, - "outputs": [], - "source": [ - "for (indicator_name in all_indicators){\n", - " \n", - " plot_label = gsub('PCT ', '', gsub('_', ' ', indicator_name))\n", - "\n", - " indicator_plot <- 
make_dhs_map(\n", - " plot_dt = plot_data,\n", - " plot_colname = indicator_name,\n", - " title_name = glue(\"Percentage children: {plot_label}\"),\n", - " legend_title = glue(\"%\"),\n", - " scale_limits = c(0, 100)\n", - " )\n", - " # indicator_plot <- ggplot(plot_data) +\n", - " # geom_sf(aes(fill = get(indicator_name))) +\n", - " # # geom_sf(aes(fill = U5_PREV_RDT_SAMPLE_AVERAGE)) +\n", - " # scale_fill_gradient(\n", - " # limits = c(0,1),\n", - " # low = \"white\",\n", - " # high = \"navy\",\n", - " # na.value = \"grey90\"\n", - " # ) +\n", - " # coord_sf() + # map projection\n", - " # theme_classic() +\n", - " # theme(plot.title = element_text(face = \"bold\", hjust = 0.5),\n", - " # legend.position = \"bottom\", legend.key.width = unit(2,\"cm\"), legend.text=element_text(size=10)) +\n", - " # labs(fill = glue(\"Percentage {plot_label}\"))\n", - " \n", - " # print(indicator_plot)\n", - " ggsave(indicator_plot, file = file.path(OUTPUT_PLOTS_PATH, glue('{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_name)}_plot.png')), dpi = 500)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f42cc870-2d67-450d-9fd7-616cfb099eb2", - "metadata": {}, - "source": [ - "## Confidence interval plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff3157da-2816-40cc-9d73-f53849947fe9", - "metadata": {}, - "outputs": [], - "source": [ - "for (indicator_name in all_indicators){\n", - " \n", - " indicator_label <- gsub('_', ' ', indicator_name)\n", - " \n", - " ci_data <- fread(file.path(\n", - " OUTPUT_DATA_PATH,\n", - " glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_name}.csv\")\n", - " ))\n", - " \n", - " sample_avg_col <- glue(\"{indicator_name}_SAMPLE_AVERAGE\")\n", - " lower_bound_col <- glue(\"{indicator_name}_CI_LOWER_BOUND\")\n", - " upper_bound_col <- glue(\"{indicator_name}_CI_UPPER_BOUND\")\n", - " ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {indicator_label} CI\")\n", - " ci_plot_xlab <- 
# File: pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_computation.r ----

#' Compute and export care-seeking indicator tables per admin unit
#'
#' For every indicator, estimates the design-based mean by `admin_name_col`
#' with a 95% confidence interval (`survey::svyby` + `survey::svymean`),
#' clamps the interval to [0, 1], rescales everything to percentages, and
#' writes one CSV and one parquet file per indicator. A summary table holding
#' only the point estimates of all indicators is written as well.
#'
#' @param kr_design_sampling Survey design object passed to `survey::svyby`.
#' @param indicator_names Character vector of indicator column names present
#'   in the design data.
#' @param admin_name_col Name of the admin-unit name column used for grouping
#'   and as merge key.
#' @param admin_data Table of admin units; must contain `admin_name_col`.
#' @param output_data_path Directory receiving the CSV / parquet files.
#' @param country_code,data_source,admin_level Strings used to build the
#'   output file names.
#' @return A list with `summary_table` (point estimates, `_SAMPLE_AVERAGE`
#'   suffix stripped from the column names) and `indicator_tables` (named
#'   list with one full table per indicator).
compute_careseeking_indicators <- function(
    kr_design_sampling,
    indicator_names,
    admin_name_col,
    admin_data,
    output_data_path,
    country_code,
    data_source,
    admin_level
) {
  summary_table <- data.table::copy(admin_data)
  indicator_tables <- list()

  for (indicator_name in indicator_names) {
    table_content <- survey::svyby(
      formula = stats::as.formula(paste("~", indicator_name)),
      by = stats::reformulate(admin_name_col),
      FUN = survey::svymean,
      design = kr_design_sampling,
      level = 0.95,
      vartype = "ci",
      na.rm = TRUE,
      influence = TRUE
    )

    data.table::setDT(table_content)
    lower_bound_col <- glue::glue("{toupper(indicator_name)}_CI_LOWER_BOUND")
    upper_bound_col <- glue::glue("{toupper(indicator_name)}_CI_UPPER_BOUND")
    sample_avg_col <- glue::glue("{toupper(indicator_name)}_SAMPLE_AVERAGE")

    names(table_content)[names(table_content) == "ci_l"] <- lower_bound_col
    names(table_content)[names(table_content) == "ci_u"] <- upper_bound_col
    names(table_content)[names(table_content) == indicator_name] <- sample_avg_col

    # The interval bounds can fall outside [0, 1]; clamp them before
    # converting proportions to percentages.
    table_content[get(lower_bound_col) < 0, (lower_bound_col) := 0]
    table_content[get(upper_bound_col) > 1, (upper_bound_col) := 1]
    table_content[, (lower_bound_col) := get(lower_bound_col) * 100]
    table_content[, (upper_bound_col) := get(upper_bound_col) * 100]
    table_content[, (sample_avg_col) := get(sample_avg_col) * 100]

    # Point estimate only: this is what goes into the summary table.
    indicator_estimation_table <- table_content[
      ,
      .SD,
      .SDcols = c(
        admin_name_col,
        grep("SAMPLE_AVERAGE", names(table_content), value = TRUE)
      )
    ]

    # Both merges are inner joins on the admin name: admin units without an
    # estimate are dropped from the per-indicator and the summary tables.
    table_content <- data.table::merge.data.table(
      admin_data,
      table_content,
      by = admin_name_col
    )
    summary_table <- data.table::merge.data.table(
      summary_table,
      indicator_estimation_table,
      by = admin_name_col
    )

    filename_without_extension <- glue::glue(
      "{country_code}_{data_source}_{admin_level}_{toupper(indicator_name)}"
    )
    utils::write.csv(
      table_content,
      file = file.path(output_data_path, paste0(filename_without_extension, ".csv")),
      row.names = FALSE
    )
    arrow::write_parquet(
      table_content,
      file.path(output_data_path, paste0(filename_without_extension, ".parquet"))
    )

    indicator_tables[[indicator_name]] <- table_content
  }

  names(summary_table) <- gsub("_SAMPLE_AVERAGE", "", names(summary_table))
  summary_filename_without_extension <- glue::glue(
    "{country_code}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE"
  )
  utils::write.csv(
    summary_table,
    file = file.path(output_data_path, paste0(summary_filename_without_extension, ".csv")),
    row.names = FALSE
  )
  arrow::write_parquet(
    summary_table,
    file.path(output_data_path, paste0(summary_filename_without_extension, ".parquet"))
  )

  list(summary_table = summary_table, indicator_tables = indicator_tables)
}


# File: pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r ----

#' Compute one indicator table per admin unit and export it
#'
#' Estimates the design-based mean of `indicator_name` by `admin_name_col`
#' with a 95% confidence interval, clamps the interval to [0, 1], converts
#' the three value columns to percentages, left-joins the result onto
#' `admin_data` and writes it as `<filename_without_extension>.csv` and
#' `.parquet` in `output_data_path`.
#'
#' @param design_obj Survey design object passed to `survey::svyby`.
#' @param indicator_name Name of the indicator column in the design data.
#' @param output_indicator_name Prefix (upper-cased) of the output columns
#'   `*_SAMPLE_AVERAGE`, `*_CI_LOWER_BOUND`, `*_CI_UPPER_BOUND`; defaults to
#'   `indicator_name`.
#' @param admin_name_col Name of the admin-unit name column (grouping and
#'   merge key).
#' @param admin_data Table of admin units; every row is kept in the output.
#' @param output_data_path Directory receiving the exported files.
#' @param filename_without_extension Base name of the exported files.
#' @return The exported table (a data.table).
compute_and_export_indicator_table <- function(
    design_obj,
    indicator_name,
    output_indicator_name = indicator_name,
    admin_name_col,
    admin_data,
    output_data_path,
    filename_without_extension
) {
  table_content <- survey::svyby(
    formula = stats::as.formula(paste("~", indicator_name)),
    by = stats::reformulate(admin_name_col),
    FUN = survey::svymean,
    design = design_obj,
    level = 0.95,
    vartype = "ci",
    na.rm = TRUE,
    influence = TRUE
  )

  data.table::setDT(table_content)
  lower_bound_col <- glue::glue("{toupper(output_indicator_name)}_CI_LOWER_BOUND")
  upper_bound_col <- glue::glue("{toupper(output_indicator_name)}_CI_UPPER_BOUND")
  sample_avg_col <- glue::glue("{toupper(output_indicator_name)}_SAMPLE_AVERAGE")

  names(table_content)[names(table_content) == "ci_l"] <- lower_bound_col
  names(table_content)[names(table_content) == "ci_u"] <- upper_bound_col
  names(table_content)[names(table_content) == indicator_name] <- sample_avg_col

  # Clamp the interval to [0, 1] first, then express everything in percent.
  table_content[get(lower_bound_col) < 0, (lower_bound_col) := 0]
  table_content[get(upper_bound_col) > 1, (upper_bound_col) := 1]
  table_content[, (lower_bound_col) := get(lower_bound_col) * 100]
  table_content[, (upper_bound_col) := get(upper_bound_col) * 100]
  table_content[, (sample_avg_col) := get(sample_avg_col) * 100]

  # Left join: admin units without an estimate stay in the table with NA.
  table_content <- data.table::merge.data.table(
    admin_data,
    table_content,
    by = admin_name_col,
    all.x = TRUE
  )
  utils::write.csv(
    table_content,
    file = file.path(output_data_path, paste0(filename_without_extension, ".csv")),
    row.names = FALSE
  )
  arrow::write_parquet(
    table_content,
    file.path(output_data_path, paste0(filename_without_extension, ".parquet"))
  )
  table_content
}


#' Compute and export the vaccination tables for every DTP dose
#'
#' Calls `compute_and_export_indicator_table()` once per dose on the column
#' `DTP<dose>` and names the outputs `<INDICATOR_ACCESS><dose>`. The per-dose
#' tables are also merged, by `admin_cols`, into a single wide table that
#' serves as input for the dropout computation.
#'
#' @param dtp_design Survey design object holding the `DTP<dose>` columns.
#' @param vaccination_doses Vector of dose numbers to process.
#' @param indicator_access Prefix of the output table / column names.
#' @param admin_name_col Name of the admin-unit name column.
#' @param admin_cols Admin columns used as merge key for the wide table.
#' @param admin_data Table of admin units.
#' @param output_data_path Directory receiving the exported files.
#' @param country_code,data_source,admin_level Strings used to build the
#'   output file names.
#' @return A list with `dtp_dropout` (wide table with all doses) and
#'   `dose_tables` (named list, one table per dose).
compute_dtp_indicator_tables <- function(
    dtp_design,
    vaccination_doses,
    indicator_access,
    admin_name_col,
    admin_cols,
    admin_data,
    output_data_path,
    country_code,
    data_source,
    admin_level
) {
  dtp_dropout <- data.table::copy(admin_data)
  dose_tables <- list()

  for (dose_number in vaccination_doses) {
    vaccine_colname <- glue::glue("DTP{dose_number}")
    table_name <- glue::glue("{toupper(indicator_access)}{dose_number}")
    filename_without_extension <- glue::glue(
      "{country_code}_{data_source}_{admin_level}_{table_name}"
    )

    df <- compute_and_export_indicator_table(
      design_obj = dtp_design,
      indicator_name = vaccine_colname,
      output_indicator_name = table_name,
      admin_name_col = admin_name_col,
      admin_data = admin_data,
      output_data_path = output_data_path,
      filename_without_extension = filename_without_extension
    )

    dose_tables[[table_name]] <- df
    dtp_dropout <- data.table::merge.data.table(dtp_dropout, df, by = admin_cols)
  }

  list(dtp_dropout = dtp_dropout, dose_tables = dose_tables)
}


#' Compute DTP dropout rates between doses and export them
#'
#' For every pair `reference_dose < current_dose`, adds a column
#' `<INDICATOR_ATTRITION>_<reference>_<current>` equal to
#' `(1 - current / reference) * 100`, computed from the `*_SAMPLE_AVERAGE`
#' columns of the two doses. Confidence-bound columns are dropped before the
#' computation and the `*_SAMPLE_AVERAGE` columns afterwards. The result is
#' written as CSV and parquet.
#'
#' The computation runs on a copy, so the table passed by the caller is left
#' untouched.
#'
#' @param dtp_dropout Wide table holding the `*_SAMPLE_AVERAGE` columns of
#'   every dose (as returned by `compute_dtp_indicator_tables()`).
#' @param vaccination_doses Vector of dose numbers.
#' @param indicator_access Prefix of the per-dose coverage columns.
#' @param indicator_attrition Prefix of the dropout columns and suffix of the
#'   output file name.
#' @param output_data_path Directory receiving the exported files.
#' @param country_code,data_source,admin_level Strings used to build the
#'   output file name.
#' @return The dropout table (a data.table).
compute_and_export_dtp_dropout <- function(
    dtp_dropout,
    vaccination_doses,
    indicator_access,
    indicator_attrition,
    output_data_path,
    country_code,
    data_source,
    admin_level
) {
  # `:=` modifies by reference; copy first so the caller's table is not
  # altered as a side effect.
  dtp_dropout <- data.table::copy(dtp_dropout)
  data.table::setDT(dtp_dropout)

  bound_cols <- grep("BOUND", names(dtp_dropout), value = TRUE)
  if (length(bound_cols) > 0) {
    dtp_dropout[, (bound_cols) := NULL]
  }

  for (current_dose in vaccination_doses) {
    # seq_len() gives an empty sequence for the first dose, whereas
    # `1:(current_dose - 1)` would iterate over c(1, 0).
    for (reference_dose in seq_len(max(current_dose - 1, 0))) {
      attrition_col <- glue::glue(
        "{toupper(indicator_attrition)}_{reference_dose}_{current_dose}"
      )
      numerator_colname <- glue::glue(
        "{toupper(indicator_access)}{current_dose}_SAMPLE_AVERAGE"
      )
      denominator_colname <- glue::glue(
        "{toupper(indicator_access)}{reference_dose}_SAMPLE_AVERAGE"
      )
      # NOTE(review): a reference coverage of 0 yields NaN/Inf here; confirm
      # whether downstream consumers expect NA instead.
      dtp_dropout[
        ,
        (attrition_col) := (1 - get(numerator_colname) / get(denominator_colname)) * 100
      ]
    }
  }

  sample_avg_cols <- grep("SAMPLE_AVERAGE", names(dtp_dropout), value = TRUE)
  if (length(sample_avg_cols) > 0) {
    dtp_dropout[, (sample_avg_cols) := NULL]
  }

  filename <- glue::glue(
    "{country_code}_{data_source}_{admin_level}_{indicator_attrition}"
  )
  data.table::fwrite(
    dtp_dropout,
    file = file.path(output_data_path, paste0(filename, ".csv"))
  )
  arrow::write_parquet(
    dtp_dropout,
    file.path(output_data_path, paste0(filename, ".parquet"))
  )
  dtp_dropout
}


#' Export one map per care-seeking indicator
#'
#' Builds a map for each indicator with `make_dhs_map()` (defined outside
#' this file) on a fixed 0-100 scale and saves it as
#' `<country>_<source>_<level>_<INDICATOR>_plot.png` at 500 dpi.
#'
#' @param plot_data Spatial table holding one column per indicator.
#' @param all_indicators Character vector of indicator column names.
#' @param output_plots_path Directory receiving the PNG files.
#' @param country_code,data_source,admin_level Strings used to build the
#'   output file names.
#' @return `NULL`, invisibly (the value of the `for` loop); called for its
#'   side effect of writing files.
export_careseeking_reporting_plots <- function(
    plot_data,
    all_indicators,
    output_plots_path,
    country_code,
    data_source,
    admin_level
) {
  for (indicator_name in all_indicators) {
    plot_label <- gsub("PCT ", "", gsub("_", " ", indicator_name))
    indicator_plot <- make_dhs_map(
      plot_dt = plot_data,
      plot_colname = indicator_name,
      title_name = glue::glue("Percentage children: {plot_label}"),
      legend_title = "%",
      scale_limits = c(0, 100)
    )
    plot_filename <- glue::glue(
      "{country_code}_{data_source}_{admin_level}_{toupper(indicator_name)}_plot.png"
    )
    # Name both arguments: `file =` only reached `filename` through partial
    # argument matching.
    ggplot2::ggsave(
      filename = file.path(output_plots_path, plot_filename),
      plot = indicator_plot,
      dpi = 500
    )
  }
}


#' Export one confidence-interval plot per care-seeking indicator
#'
#' For each indicator, reads `<country>_<source>_<level>_<indicator>.csv`
#' from `output_data_path`, draws the point estimate with its confidence
#' interval per admin unit with `make_ci_plot()` (defined outside this file)
#' and saves the figure as
#' `<country>_<source>_<level>_<INDICATOR>_CI_plot.png` (8 x 6, 300 dpi).
#'
#' @param all_indicators Character vector of indicator names.
#' @param output_data_path Directory holding the per-indicator CSV files.
#' @param output_plots_path Directory receiving the PNG files.
#' @param country_code,data_source,admin_level Strings used to build the
#'   input and output file names.
#' @param admin_name_col Admin-unit name column, used on the x axis.
#' @return `NULL`, invisibly (the value of the `for` loop); called for its
#'   side effect of writing files.
export_careseeking_reporting_ci_plots <- function(
    all_indicators,
    output_data_path,
    output_plots_path,
    country_code,
    data_source,
    admin_level,
    admin_name_col
) {
  for (indicator_name in all_indicators) {
    indicator_label <- gsub("_", " ", indicator_name)
    # NOTE(review): the computation helpers above name files and columns with
    # toupper(indicator_name), while the name is used as-is here; this
    # presumably relies on the indicator names already being upper case.
    ci_data <- data.table::fread(
      file.path(
        output_data_path,
        glue::glue("{country_code}_{data_source}_{admin_level}_{indicator_name}.csv")
      )
    )

    sample_avg_col <- glue::glue("{indicator_name}_SAMPLE_AVERAGE")
    lower_bound_col <- glue::glue("{indicator_name}_CI_LOWER_BOUND")
    upper_bound_col <- glue::glue("{indicator_name}_CI_UPPER_BOUND")
    ci_plot <- make_ci_plot(
      df_to_plot = ci_data,
      admin_colname = admin_name_col,
      point_estimation_colname = sample_avg_col,
      ci_lower_colname = lower_bound_col,
      ci_upper_colname = upper_bound_col,
      title_name = glue::glue("{country_code} {data_source} {indicator_label} CI"),
      x_title = admin_name_col,
      y_title = glue::glue("{indicator_label} (%)")
    )
    ci_plot_filename <- glue::glue(
      "{country_code}_{data_source}_{admin_level}_{toupper(indicator_name)}_CI_plot.png"
    )
    ggplot2::ggsave(
      plot = ci_plot,
      filename = file.path(output_plots_path, ci_plot_filename),
      width = 8,
      height = 6,
      dpi = 300
    )
  }
}
cat(msg)\n", + " stop(msg)\n", " })\n", "\n", "log_msg(glue(\"SNT configuration loaded from: {file.path(CONFIG_PATH, 'SNT_config.json')}\"))" @@ -225,34 +227,11 @@ "source": [ "# load as vector data\n", "dhis2_formatted_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "if (!is.null(SHAPES_FILE) && !is.na(SHAPES_FILE) && trimws(SHAPES_FILE) != \"\") {\n", - " custom_shapes_path <- path.expand(SHAPES_FILE)\n", - " if (!file.exists(custom_shapes_path)) {\n", - " stop(glue(\"[ERROR] Custom shapes file was provided but does not exist: {custom_shapes_path}\"))\n", - " }\n", - "\n", - " spatial_units_data <- tryCatch(\n", - " {\n", - " st_read(custom_shapes_path, quiet = TRUE)\n", - " },\n", - " error = function(e) {\n", - " stop(glue(\"[ERROR] Error while loading custom shapes file: {custom_shapes_path} [ERROR DETAILS] {conditionMessage(e)}\"))\n", - " }\n", - " )\n", - "\n", - " log_msg(glue(\"Custom shapes file loaded successfully: {custom_shapes_path}\"))\n", - " log_msg(\n", - " \"[WARNING] Using a custom shapefile: hierarchy may not align with the extracted DHIS2 pyramid. 
During data assembly, this mismatch can result in missing values for some organizational units (especially at ADM2 level) if IDs do not match or do not exist in both files.\",\n", - " level = \"warning\"\n", - " )\n", - "} else {\n", - " spatial_units_data <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " stop(glue(\"[ERROR] Error while loading DHIS2 Shapes data for: {paste0(COUNTRY_CODE, '_shapes.geojson')} [ERROR DETAILS] {conditionMessage(e)}\"))\n", - " })\n", - " log_msg(glue(\"Default HMIS/NMDR shapes file downloaded successfully from dataset: {dhis2_formatted_dataset}\"))\n", - "}" + "spatial_units_data <- load_spatial_units_data(\n", + " shapes_file = SHAPES_FILE,\n", + " dhis2_dataset = dhis2_formatted_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")" ] }, { @@ -279,24 +258,14 @@ }, "outputs": [], "source": [ - "# change CRS (Coordinate Reference System)\n", - "spatial_units_data <- reproject_epsg(spatial_units_data, country_epsg_degrees)\n", - "\n", - "# Drop rows with null/empty/invalid geometry (terra::vect() fails otherwise, e.g. 
Cameroon)\n", - "n_before <- nrow(spatial_units_data)\n", - "spatial_units_data <- spatial_units_data %>%\n", - " filter(!is.na(st_is_valid(.)), st_is_valid(.), !st_is_empty(.))\n", - "if (nrow(spatial_units_data) < n_before) {\n", - " log_msg(glue(\"Dropped {n_before - nrow(spatial_units_data)} spatial unit(s) with null/empty/invalid geometry.\"))\n", - "}\n", - "\n", - "# Make the related data objects: the admin data and the country polygon\n", - "# drop geometry to get attribute table (as a data.table)\n", - "admin_data <- setDT(st_drop_geometry(spatial_units_data))\n", - "setDT(admin_data)\n", + "prepared_spatial <- prepare_spatial_admin_objects(\n", + " spatial_units_data = spatial_units_data,\n", + " country_epsg_degrees = country_epsg_degrees\n", + ")\n", "\n", - "# make the whole country polygon\n", - "all_country <- st_union(spatial_units_data)" + "spatial_units_data <- prepared_spatial$spatial_units_data\n", + "admin_data <- prepared_spatial$admin_data\n", + "all_country <- prepared_spatial$all_country" ] }, { @@ -832,33 +801,13 @@ }, "outputs": [], "source": [ - "# total population per admin unit\n", - "pop_total_by_adm2 <- terra::zonal(\n", - " pop_healthcare_data$POP_TOTAL, # template raster\n", - " adm2_raster,\n", - " fun = \"sum\",\n", - " na.rm = TRUE\n", - ")\n", - "\n", - "log_msg(\"Aggregated the total population by spatial units.\")\n", - "\n", - "# covered population per admin unit\n", - "pop_cov_by_adm2 <- terra::zonal(\n", - " pop_covered_healthcare,\n", - " adm2_raster,\n", - " fun = \"sum\",\n", - " na.rm = TRUE\n", - ")\n", - "\n", - "log_msg(\"Aggregated the covered population by spatial units.\")\n", - "\n", - "adm2_pop_total <- setDT(as.data.frame(pop_total_by_adm2))\n", - "adm2_pop_covered <- setDT(as.data.frame(pop_cov_by_adm2))\n", - "\n", - "output_df <- merge.data.table(adm2_pop_total, adm2_pop_covered, by = admin_col, all = TRUE)\n", - "\n", - "if(nrow(output_df) != nrow(adm2_pop_total)) \n", - " stop(\"Error: There was an error 
when computing covered population.\") " + "output_df <- compute_population_by_admin(\n", + " pop_total_raster = pop_healthcare_data$POP_TOTAL,\n", + " pop_covered_raster = pop_covered_healthcare,\n", + " adm_raster = adm2_raster,\n", + " admin_col = admin_col,\n", + " admin_data = admin_data\n", + ")" ] }, { @@ -872,9 +821,6 @@ }, "outputs": [], "source": [ - "# make the percentage covered column\n", - "output_df$PCT_HEALTH_ACCESS <- output_df$POP_COVERED*100 / output_df$POP_TOTAL\n", - "output_df <- merge.data.table(admin_data, output_df, by = admin_col, all.x = TRUE)\n", "head(output_df)" ] }, diff --git a/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r b/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r new file mode 100644 index 0000000..7cf1db2 --- /dev/null +++ b/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r @@ -0,0 +1,90 @@ +load_spatial_units_data <- function(shapes_file, dhis2_dataset, country_code) { + if (!is.null(shapes_file) && !is.na(shapes_file) && trimws(shapes_file) != "") { + custom_shapes_path <- path.expand(shapes_file) + if (!file.exists(custom_shapes_path)) { + stop(glue::glue("[ERROR] Custom shapes file was provided but does not exist: {custom_shapes_path}")) + } + + spatial_units_data <- tryCatch( + { + sf::st_read(custom_shapes_path, quiet = TRUE) + }, + error = function(e) { + stop(glue::glue( + "[ERROR] Error while loading custom shapes file: {custom_shapes_path} [ERROR DETAILS] {conditionMessage(e)}" + )) + } + ) + + log_msg(glue::glue("Custom shapes file loaded successfully: {custom_shapes_path}")) + log_msg( + "[WARNING] Using a custom shapefile: hierarchy may not align with the extracted DHIS2 pyramid. 
During data assembly, this mismatch can result in missing values for some organizational units (especially at ADM2 level) if IDs do not match or do not exist in both files.", + level = "warning" + ) + return(spatial_units_data) + } + + spatial_units_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dhis2_dataset, paste0(country_code, "_shapes.geojson")) + }, + error = function(e) { + stop(glue::glue( + "[ERROR] Error while loading DHIS2 Shapes data for: {paste0(country_code, '_shapes.geojson')} [ERROR DETAILS] {conditionMessage(e)}" + )) + } + ) + log_msg(glue::glue("Default HMIS/NMDR shapes file downloaded successfully from dataset: {dhis2_dataset}")) + spatial_units_data +} + + +prepare_spatial_admin_objects <- function(spatial_units_data, country_epsg_degrees) { + spatial_units_data <- reproject_epsg(spatial_units_data, country_epsg_degrees) + + n_before <- nrow(spatial_units_data) + spatial_units_data <- spatial_units_data %>% + dplyr::filter(!is.na(sf::st_is_valid(.)), sf::st_is_valid(.), !sf::st_is_empty(.)) + if (nrow(spatial_units_data) < n_before) { + log_msg(glue::glue("Dropped {n_before - nrow(spatial_units_data)} spatial unit(s) with null/empty/invalid geometry.")) + } + + admin_data <- data.table::setDT(sf::st_drop_geometry(spatial_units_data)) + all_country <- sf::st_union(spatial_units_data) + + list( + spatial_units_data = spatial_units_data, + admin_data = admin_data, + all_country = all_country + ) +} + + +compute_population_by_admin <- function(pop_total_raster, pop_covered_raster, adm_raster, admin_col, admin_data) { + pop_total_by_adm2 <- terra::zonal( + pop_total_raster, + adm_raster, + fun = "sum", + na.rm = TRUE + ) + log_msg("Aggregated the total population by spatial units.") + + pop_cov_by_adm2 <- terra::zonal( + pop_covered_raster, + adm_raster, + fun = "sum", + na.rm = TRUE + ) + log_msg("Aggregated the covered population by spatial units.") + + adm2_pop_total <- data.table::setDT(as.data.frame(pop_total_by_adm2)) + 
adm2_pop_covered <- data.table::setDT(as.data.frame(pop_cov_by_adm2)) + output_df <- data.table::merge.data.table(adm2_pop_total, adm2_pop_covered, by = admin_col, all = TRUE) + + if (nrow(output_df) != nrow(adm2_pop_total)) { + stop("Error: There was an error when computing covered population.") + } + + output_df$PCT_HEALTH_ACCESS <- output_df$POP_COVERED * 100 / output_df$POP_TOTAL + data.table::merge.data.table(admin_data, output_df, by = admin_col, all.x = TRUE) +} diff --git a/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb b/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb index 87af083..b53b685 100644 --- a/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb +++ b/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb @@ -1,196 +1,217 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "5777b72c-d87e-47c5-87b1-2698a6510b2f", - "metadata": {}, - "source": [ - "# **Cartes extraites du Malaria Atlas Project (MAP)**" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "5777b72c-d87e-47c5-87b1-2698a6510b2f", + "metadata": {}, + "source": [ + "# **Cartes extraites du Malaria Atlas Project (MAP)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6397ab91-1ae4-4db7-b6c3-061c453a7b03", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_map_extracts\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_map_extracts_report.r\"))\n", + "\n", + "# List required packages\n", + "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", 
\"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "# Required environment for the sf packages\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93e04996-13ba-4855-a1d1-46e70ba4640e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$SNT_MAP_EXTRACT\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1ee21d1-c7d1-4893-ac56-91abb92926ea", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# printdim() loaded from utils/snt_map_extracts_report.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7de799e-896c-4237-a9f6-9dafc0f30bde", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import seasonality data\n", + "map_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_map_data.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, 
conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "# import DHIS2 shapes data\n", + "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "printdim(map_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84031c7e-e9c6-4496-896f-7f7f3403d951", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "names(map_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b05b53-3f65-424a-af22-0686238a06c9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "unique(map_data$METRIC_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0515ab-4dc2-4671-8c3c-236578a840d8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Merge geometry with map data\n", + "map_data_joined <- dplyr::left_join(shapes_data, map_data, by = c(\"ADM2_ID\" = \"ADM2_ID\"))\n", + "\n", + "# Get list of metrics\n", + "metrics <- unique(map_data$METRIC_NAME)\n", + "\n", + "# Create one map per metric\n", + "plots <- build_metric_plots(map_data_joined = map_data_joined, metrics = metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f1dc1df-211d-4174-83ae-e8ae974fa790", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set plot size for individual display\n", + "options(repr.plot.width = 10, repr.plot.height = 8)\n", + "\n", + "# Loop through plots and print one by one\n", + "for (p in plots) {\n", + " print(p)\n", + " Sys.sleep(1) # Optional: short pause between 
plots\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50c843e4-9157-480d-acde-80887410d156", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "6397ab91-1ae4-4db7-b6c3-061c453a7b03", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# Required environment for the sf packages\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"93e04996-13ba-4855-a1d1-46e70ba4640e", - "metadata": {}, - "outputs": [], - "source": [ - "# Configuration variables\n", - "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$SNT_MAP_EXTRACT\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1ee21d1-c7d1-4893-ac56-91abb92926ea", - "metadata": {}, - "outputs": [], - "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7de799e-896c-4237-a9f6-9dafc0f30bde", - "metadata": {}, - "outputs": [], - "source": [ - "# import seasonality data\n", - "map_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_map_data.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "# import DHIS2 shapes data\n", - "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "printdim(map_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84031c7e-e9c6-4496-896f-7f7f3403d951", - "metadata": {}, - "outputs": [], - "source": [ - "names(map_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66b05b53-3f65-424a-af22-0686238a06c9", - "metadata": {}, - "outputs": [], - "source": [ - 
"unique(map_data$METRIC_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d0515ab-4dc2-4671-8c3c-236578a840d8", - "metadata": {}, - "outputs": [], - "source": [ - "# Merge geometry with map data\n", - "map_data_joined <- left_join(shapes_data, map_data, by = c(\"ADM2_ID\" = \"ADM2_ID\"))\n", - "\n", - "# Get list of metrics\n", - "metrics <- unique(map_data$METRIC_NAME)\n", - "\n", - "# Create one map per metric\n", - "plots <- map(metrics, function(metric) {\n", - " ggplot(map_data_joined %>% filter(METRIC_NAME == metric)) +\n", - " geom_sf(aes(fill = VALUE), color = \"white\") +\n", - " scale_fill_viridis_c(option = \"C\", na.value = \"lightgrey\") +\n", - " labs(\n", - " title = paste0(metric),\n", - " fill = \"Valeur\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(size = 20, face = \"bold\"),\n", - " legend.title = element_text(size = 16),\n", - " legend.text = element_text(size = 14)\n", - " )\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f1dc1df-211d-4174-83ae-e8ae974fa790", - "metadata": {}, - "outputs": [], - "source": [ - "# Set plot size for individual display\n", - "options(repr.plot.width = 10, repr.plot.height = 8)\n", - "\n", - "# Loop through plots and print one by one\n", - "for (p in plots) {\n", - " print(p)\n", - " Sys.sleep(1) # Optional: short pause between plots\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50c843e4-9157-480d-acde-80887410d156", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git 
a/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r b/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r new file mode 100644 index 0000000..eed83a0 --- /dev/null +++ b/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r @@ -0,0 +1,22 @@ +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} + + +build_metric_plots <- function(map_data_joined, metrics) { + purrr::map(metrics, function(metric) { + ggplot2::ggplot(map_data_joined %>% dplyr::filter(METRIC_NAME == metric)) + + ggplot2::geom_sf(ggplot2::aes(fill = VALUE), color = "white") + + ggplot2::scale_fill_viridis_c(option = "C", na.value = "lightgrey") + + ggplot2::labs( + title = paste0(metric), + fill = "Valeur" + ) + + ggplot2::theme_minimal(base_size = 16) + + ggplot2::theme( + plot.title = ggplot2::element_text(size = 20, face = "bold"), + legend.title = ggplot2::element_text(size = 16), + legend.text = ggplot2::element_text(size = 14) + ) + }) +} diff --git a/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb b/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb index 279cbc6..b17fee4 100644 --- a/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb +++ b/pipelines/snt_seasonality_rainfall/code/snt_seasonality_rainfall.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "5eebc540-e973-497e-8427-e73d546fdd09", "metadata": { "editable": true, "slideshow": { @@ -40,40 +41,41 @@ "---\n", "\n", "## Preliminaries" - ], - "id": "5eebc540-e973-497e-8427-e73d546fdd09" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" - ], - "execution_count": null, - "outputs": [], - "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa" + ] }, { "cell_type": 
"code", + "execution_count": null, + "id": "b6b4eaed", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Clear environment\n", "rm(list=ls())" - ], - "execution_count": null, - "outputs": [], - "id": "b6b4eaed" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287", "metadata": { "editable": true, "slideshow": { @@ -84,19 +86,19 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Global settings\n", "options(scipen=999)\n", "\n", "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ], - "execution_count": null, - "outputs": [], - "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a6915379-108e-4405-b553-b074aad447d6", "metadata": { "editable": true, "slideshow": { @@ -107,6 +109,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Paths\n", "ROOT_PATH <- '~/workspace'\n", @@ -116,34 +119,34 @@ "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_rainfall')\n", "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" - ], - "execution_count": null, - "outputs": [], - "id": "a6915379-108e-4405-b553-b074aad447d6" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load utils\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_seasonality_rainfall.r\"))" - ], - "execution_count": null, - "outputs": [], - "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "180b93e7-61af-4981-863f-593b755968bd", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# List required pcks\n", "required_packages <- c(\n", @@ -161,13 +164,12 @@ 
"\n", "# Execute function\n", "install_and_load(required_packages)" - ], - "execution_count": null, - "outputs": [], - "id": "180b93e7-61af-4981-863f-593b755968bd" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a", "metadata": { "editable": true, "slideshow": { @@ -178,6 +180,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "reticulate::py_config()$python\n", @@ -192,13 +195,12 @@ " log_msg(error_msg, level = \"error\")\n", " stop(error_msg)\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f", "metadata": { "editable": true, "slideshow": { @@ -209,6 +211,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Load SNT config\n", "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", @@ -228,45 +231,44 @@ "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", "print(paste(\"Country code: \", COUNTRY_CODE))" - ], - "execution_count": null, - "outputs": [], - "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f" + ] }, { "cell_type": "markdown", + "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741", "metadata": {}, "source": [ "## Globals and parameters" - ], - "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741" + ] }, { "cell_type": "markdown", + "id": "414f9ee0-5264-43c4-992f-cff6c719d65c", "metadata": {}, "source": [ "**Parameters**" - ], - "id": "414f9ee0-5264-43c4-992f-cff6c719d65c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "fb82560d-c123-4c54-bfa1-fb5f05e4ad69", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "minimum_periods <- as.integer(48)\n", "maximum_proportion_missings_overall <- 0.1\n", "maximum_proportion_missings_per_district <- 0.2" - ], - "execution_count": null, - "outputs": [], - "id": 
"fb82560d-c123-4c54-bfa1-fb5f05e4ad69" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988", "metadata": { "editable": true, "slideshow": { @@ -279,6 +281,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Fallback parameter values for local/dev execution\n", "# When run via pipeline, these are injected by Papermill in the first cell\n", @@ -308,21 +311,20 @@ "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" - ], - "execution_count": null, - "outputs": [], - "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988" + ] }, { "cell_type": "markdown", + "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630", "metadata": {}, "source": [ "**Fixed routine formatting columns**" - ], - "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee", "metadata": { "editable": true, "slideshow": { @@ -333,6 +335,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Global variables\n", "type_of_seasonality <- \"rainfall\"\n", @@ -357,13 +360,12 @@ "year_col <- 'YEAR'\n", "month_col <- 'MONTH'\n", "period_cols <- c(year_col, month_col)" - ], - "execution_count": null, - "outputs": [], - "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db", "metadata": { "editable": true, "slideshow": { @@ -374,42 +376,42 @@ "languageId": "r" } }, + "outputs": [], "source": [ "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" - ], - 
"execution_count": null, - "outputs": [], - "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db" + ] }, { "cell_type": "markdown", + "id": "86f492f3-5634-4987-a2b8-23014aba5d51", "metadata": {}, "source": [ "## Load data" - ], - "id": "86f492f3-5634-4987-a2b8-23014aba5d51" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "623480ee-4310-4ead-a8c8-bf294527c814", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load spatial file from dataset\n", "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "623480ee-4310-4ead-a8c8-bf294527c814" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d", "metadata": { "editable": true, "slideshow": { @@ -420,23 +422,24 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Load rainfall data from dataset\n", "rainfall_data_filename <- paste(COUNTRY_CODE, \"total_precipitation_monthly.parquet\", sep = \"_\")\n", "original_dt <- get_latest_dataset_file_in_memory(era5_dataset, rainfall_data_filename)\n", "log_msg(glue(\"File {rainfall_data_filename} successfully loaded from dataset version: {era5_dataset}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7b769deb-52e5-471d-9950-ac431dd8cf03", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Columns formatting\n", "admin_data <- st_drop_geometry(spatial_data)\n", @@ -449,37 +452,35 @@ "rain_proportion_col <- 'RAIN_PROPORTION'\n", "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, rain_proportion_col)\n", 
"print(final_table_cols)" - ], - "execution_count": null, - "outputs": [], - "id": "7b769deb-52e5-471d-9950-ac431dd8cf03" + ] }, { "cell_type": "markdown", + "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d", "metadata": {}, "source": [ "**Create the containers for the data**" - ], - "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Create an empty table if the analysis is stopped for lack of enough data\n", "seasonality_cols <- c(seasonality_col, season_duration_col, season_start_month_col, rain_proportion_col)\n", "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" - ], - "execution_count": null, - "outputs": [], - "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca" + ] }, { "cell_type": "markdown", + "id": "b8da71be-45f1-405c-857c-ed86984988f4", "metadata": { "editable": true, "slideshow": { @@ -489,11 +490,12 @@ }, "source": [ "## Preprocess input data" - ], - "id": "b8da71be-45f1-405c-857c-ed86984988f4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0", "metadata": { "editable": true, "slideshow": { @@ -504,6 +506,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# format table\n", "setDT(original_dt)\n", @@ -511,13 +514,12 @@ "numeric_cols <- c(original_values_col)\n", "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", "# head(original_dt)" - ], - "execution_count": null, - "outputs": [], - "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f", "metadata": { "editable": true, "slideshow": { @@ -528,6 +530,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# keep only the useful columns and aggregate the data on them\n", "original_dt <- original_dt[,\n", @@ -559,34 +562,34 @@ " 
log_msg(\"There are too many missing values in the data overall. Abandoning analysis.\", level=\"error\")\n", " stop(\"ERROR 2\") \n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f" + ] }, { "cell_type": "markdown", + "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54", "metadata": {}, "source": [ "### Imputation of missings" - ], - "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54" + ] }, { "cell_type": "markdown", + "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12", "metadata": {}, "source": [ "**Remove impute files (if any)**" - ], - "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Remove existing imputation files\n", "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", @@ -594,13 +597,12 @@ "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", "file.remove(files_to_remove)\n", "print(glue(\"Deleted files: {str(files_to_remove)}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6", "metadata": { "editable": true, "slideshow": { @@ -611,6 +613,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# create the name of the column which will store the imputed/estimated values\n", "imputed_col = paste(original_values_col, 'EST', sep = '_')\n", @@ -662,13 +665,11 @@ " imputed_dt <- copy(original_dt)\n", " imputed_dt[, (imputed_col) := get(original_values_col)]\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6" + ] }, { "cell_type": "markdown", + "id": "9db44942-d844-491c-9045-906e99a37c60", "metadata": { "editable": true, "slideshow": { @@ -678,11 
+679,12 @@ }, "source": [ "## Seasonality" - ], - "id": "9db44942-d844-491c-9045-906e99a37c60" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b", "metadata": { "editable": true, "slideshow": { @@ -693,6 +695,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Step 1: Compute month-level seasonality indicators\n", "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", @@ -732,13 +735,11 @@ "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", "seasonality_source_dt <- seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" - ], - "execution_count": null, - "outputs": [], - "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b" + ] }, { "cell_type": "markdown", + "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62", "metadata": { "editable": true, "slideshow": { @@ -748,27 +749,28 @@ }, "source": [ "## Result file" - ], - "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62" + ] }, { "cell_type": "markdown", + "id": "477fb459-0f98-4a32-96ab-f10b4395495f", "metadata": {}, "source": [ "### long" - ], - "id": "477fb459-0f98-4a32-96ab-f10b4395495f" + ] }, { "cell_type": "markdown", + "id": "db719aed-6347-48f4-8984-add9f8adec2d", "metadata": {}, "source": [ "This format, until further notice, is not saved." 
- ], - "id": "db719aed-6347-48f4-8984-add9f8adec2d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9", "metadata": { "editable": true, "slideshow": { @@ -779,6 +781,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "seasonality_long_dt <- melt(\n", " seasonality_source_dt,\n", @@ -786,13 +789,12 @@ " variable.name = 'MONTH_BLOCK_SIZE',\n", " value.name =seasonality_col\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499", "metadata": { "editable": true, "slideshow": { @@ -803,34 +805,34 @@ "languageId": "r" } }, + "outputs": [], "source": [ "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", "\n", "# add remaining admin unit columns and save the final results\n", "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" - ], - "execution_count": null, - "outputs": [], - "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# order the columns\n", "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" - ], - "execution_count": null, - "outputs": [], - "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015", "metadata": { "editable": true, "slideshow": { @@ -841,18 +843,17 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Keeping for now.\n", "# # 
filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" - ], - "execution_count": null, - "outputs": [], - "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015" + ] }, { "cell_type": "markdown", + "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2", "metadata": { "editable": true, "slideshow": { @@ -862,11 +863,12 @@ }, "source": [ "### Transform to wide format" - ], - "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e71259c9-29a6-452f-8949-74adb0e62c1c", "metadata": { "editable": true, "slideshow": { @@ -877,6 +879,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "seasonality_wide_dt <- compute_min_seasonality_block(\n", " input_dt=seasonality_source_dt,\n", @@ -885,13 +888,12 @@ " seasonal_blocksize_colname=season_duration_col,\n", " valid_value = 1\n", ")" - ], - "execution_count": null, - "outputs": [], - "id": "e71259c9-29a6-452f-8949-74adb0e62c1c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9284c723-97d4-46f9-8837-7f70dae92a31", "metadata": { "editable": true, "slideshow": { @@ -902,6 +904,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", @@ -947,13 +950,12 @@ "\n", "# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units\n", "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" - ], - 
"execution_count": null, - "outputs": [], - "id": "9284c723-97d4-46f9-8837-7f70dae92a31" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc", "metadata": { "editable": true, "slideshow": { @@ -964,26 +966,26 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# add remaining admin unit columns and save the final results\n", "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", "# head(admin_seasonality_wide_dt)" - ], - "execution_count": null, - "outputs": [], - "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc" + ] }, { "cell_type": "markdown", + "id": "a2f1a373-4b34-42db-b591-b25c7050dee6", "metadata": {}, "source": [ "**Save output**" - ], - "id": "a2f1a373-4b34-42db-b591-b25c7050dee6" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed", "metadata": { "editable": true, "slideshow": { @@ -994,6 +996,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Create the filename\n", "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')\n", @@ -1002,38 +1005,35 @@ "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", "log_msg(paste0(\"Rainfall seasonality results saved in folder \", OUTPUT_DATA_PATH))" - ], - "execution_count": null, - "outputs": [], - "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" - ], - "execution_count": null, - "outputs": [], - "id": 
"9533db1c-a9db-42be-a4cd-4076d9f212ba" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" - ], - "execution_count": null, - "outputs": [], - "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54" + ] } ], "metadata": { @@ -1053,4 +1053,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb b/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb index 8ff90eb..2fba875 100644 --- a/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb +++ b/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb @@ -1,494 +1,561 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "fdcd4199-5098-4ed6-90b2-a2c678fbc1ae", - "metadata": {}, - "source": [ - "# **WorldPop versus DHIS2 population comparison analysis**\n" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "fdcd4199-5098-4ed6-90b2-a2c678fbc1ae", + "metadata": {}, + "source": [ + "# **WorldPop versus DHIS2 population comparison analysis**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "310b555f-69a6-44a9-920d-34b517e72804", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_worldpop_extract\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_worldpop_extract_report.r\"))\n", + "\n", + "# List required packages\n", + 
"required_packages <- c(\"tidyr\", \"terra\", \"arrow\", \"sf\", \"dplyr\", \"ggplot2\", \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6db9d75a-e1b0-410b-b229-de124f035375", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set variables to load openhexa.sdk from the right environment\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load openhexa.sdk\n", + "print(reticulate::py_config()$python)\n", + "tryCatch({ \n", + " openhexa <- import(\"openhexa.sdk\") \n", + "},\n", + "error = function(e) {\n", + " msg <- paste0(\"Error while loading openhexa.sdk\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80e3c014-3830-44cf-a3a8-0cfaee513b6d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "# DHIS2 Dataset extract identifier\n", + "worldpop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$WORLDPOP_DATASET_EXTRACT\n", + "dhis2_formatted_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ] + }, + { + "cell_type": "markdown", + "id": "7bea2d76-dbbd-4577-ab46-bd6fc1c3142a", + "metadata": {}, + "source": [ + "### Load data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae98618e-d3f9-4626-a6da-8a11922b90c7", + "metadata": { + "vscode": { + 
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Select the parquet file from the WorldPop dataset.\n", + "dataset_last_version <- openhexa$workspace$get_dataset(worldpop_dataset)$latest_version\n", + "if (is.null(dataset_last_version)) {\n", + " stop(\"No version available in SNT WorldPop dataset. Process stopped.\")\n", + "}\n", + "\n", + "parquet_file <- find_country_parquet_file(dataset_last_version, COUNTRY_CODE)\n", + "if (is.null(parquet_file)) {\n", + " stop(\"No *.parquet file available in SNT WorldPop dataset. Process stopped.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dd60960-22e3-49d0-bebc-c0c121f86daf", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load worldpop population\n", + "worldpop_population <- tryCatch({ get_latest_dataset_file_in_memory(worldpop_dataset, parquet_file) },\n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading WorldPop population file \",parquet_file,\" for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"WorldPop population file \",parquet_file,\" loaded from dataset : \", worldpop_dataset, \" dataframe dimensions: \", paste(dim(worldpop_population), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de2af2a9-159f-4ffd-969f-9e5f52ac6520", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load DHIS2 population\n", + "dhis2_population <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 population file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 population data loaded 
from dataset : \", dhis2_formatted_dataset, \" dataframe dimensions: \", paste(dim(dhis2_population), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "844931d6-4608-49ca-9151-73705c69944e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load DHIS2 shapes data\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "msg <- paste0(\"DHIS2 shapes data loaded from dataset : \", dhis2_formatted_dataset, \" dataframe dimensions: \", paste(dim(shapes_data), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "6394d646-4d4e-4245-a791-6b86c3f1c25e", + "metadata": {}, + "source": [ + "## Zone de sante comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aefc590-c4e3-4c23-9b4b-e42ac144c27e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select the closest year (Worldpop does not update data since 2020)\n", + "year_selection <- get_comparison_years(worldpop_population, dhis2_population)\n", + "worldpop_year <- year_selection$worldpop_year\n", + "dhis2_year <- year_selection$dhis2_year\n", + "\n", + "print(paste0(\"Comparison years DHIS2: \", dhis2_year, \" Worldpop : \", worldpop_year))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d946ffaa-14e7-41aa-9ab8-29656b440e19", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 5. 
Compare WorldPop vs DHIS2 (if you have a matching ID column)\n", + "comparison_df <- build_adm2_comparison(\n", + " shapes_data = shapes_data,\n", + " dhis2_population = dhis2_population,\n", + " worldpop_population = worldpop_population,\n", + " dhis2_year = dhis2_year,\n", + " worldpop_year = worldpop_year\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d49ccb1-218f-4e1d-a8c1-5e15b9488202", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(comparison_df, aes(x = dhis2_value, y = worldpop_value)) +\n", + " geom_point() +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"gray\") +\n", + " labs(x = \"DHIS2 Population\", \n", + " y = \"WorldPop Population\", \n", + " title = \"Comparison per ADM2\") +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26dc148-6f45-45e1-a42a-cb8610a280ee", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "pop_correlation <- cor(comparison_df$dhis2_value, comparison_df$worldpop_value, method = 'pearson')\n", + "print(paste0(\"Correlation : \", round(pop_correlation, 2)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b275bbf1-eb89-4e1a-951c-f7fdd014c5aa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(comparison_df) +\n", + " geom_sf(aes(fill = worldpop_value - dhis2_value)) +\n", + " scale_fill_gradient2(low = \"blue\", mid = \"white\", high = \"red\", midpoint = 0) +\n", + " labs(title = \"Difference: WorldPop - DHIS2 Population (ADM2)\",\n", + " fill = \"Pop. 
Diff\") +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df524f99-5268-4d41-afdb-9596b39b269e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "sapply(\n", + " setNames(\n", + " list(comparison_df$dhis2_value, comparison_df$worldpop_value),\n", + " c(\"DHIS2_population\", \"WPOP_population\")\n", + " ),\n", + " summary\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4878bd53-dc03-46ee-8a15-65e90de07221", + "metadata": {}, + "source": [ + "The above table shows that some of the values of WPOP tend to be less plausible, with some Zones de Santé (ZS) having 0 inhabitants, while the largest ZS, which is likely one of the districts of Kinshasa, appears with nearly 22 million inhabitants, thus more than the total population of Kinshasa (17 million). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "477baa64-a3b4-43aa-9a7b-c2ef31f1f0f7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "comp.pop <- comparison_df %>%\n", + " select(ADM2_ID, worldpop_value, dhis2_value) %>%\n", + " mutate(\n", + " diff = worldpop_value - dhis2_value,\n", + " ratio = worldpop_value / dhis2_value,\n", + " relative_diff = diff / dhis2_value\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51534fe9-22dc-4b31-abdf-5fd6a8ba783d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist_pop <- st_drop_geometry(comp.pop) %>%\n", + " select(ADM2_ID, worldpop_value, dhis2_value) %>%\n", + " rename(\n", + " WPOP = worldpop_value,\n", + " DHIS2 = dhis2_value\n", + " ) %>%\n", + " pivot_longer(\n", + " cols = c(WPOP, DHIS2),\n", + " names_to = \"source\",\n", + " values_to = \"population\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a991e14-a68d-4df1-8dbe-651693824d5d", + "metadata": { + "vscode": { + "languageId": "r" + 
} + }, + "outputs": [], + "source": [ + "ggplot(hist_pop, aes(x=population, color=source)) +\n", + " geom_histogram(fill = NA, alpha = 0.5, position = \"identity\", binwidth = 10000) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "16dc7bee-fea1-4d29-b23d-196771b2f094", + "metadata": {}, + "source": [ + "WorldPop data appears to have more variability and some extreme outliers to the right." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77028457-1fca-48b2-9e99-e233bf9286bc", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(comp.pop, aes(y=relative_diff)) + \n", + " geom_boxplot() + coord_flip() + theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "dbe337de-e8e0-4409-a026-fb5a2232636e", + "metadata": {}, + "source": [ + "The above plot shows that for 75% of Zones de Santé (ZS), the difference between DHIS2 and WorldPop data is within 50% of the total DHIS2, which indicates significant deviation between the two sources. Most of the remaining ZS (except the outliers represented as dots) have differences of population that are 0.5-1.5 of the total DHIS2 population." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b60680c3-4696-48af-9bba-b51210c187d8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(comparison_df) +\n", + " geom_sf(aes(fill = (worldpop_value - dhis2_value)/dhis2_value)) +\n", + " scale_fill_gradient2(low = \"blue\", mid = \"white\", high = \"red\", midpoint = 0) +\n", + " labs(title = \"ADM2 Relative difference: WorldPop - DHIS2 Population (reference)\",\n", + " fill = \"Relative Pop. 
Diff\") +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "bb54485d-e3b1-4f5d-83b0-a20981e233e9", + "metadata": {}, + "source": [ + "## Province comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7d250c2-7353-4926-b0b5-7f7ea81005ec", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "comparison_df_prov <- build_adm1_comparison(\n", + " shapes_data = shapes_data,\n", + " dhis2_population = dhis2_population,\n", + " worldpop_population = worldpop_population\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "353031e2-475e-4478-b643-aa7e86885365", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(comparison_df_prov, aes(x = dhis2_value, y = worldpop_value)) +\n", + " geom_point() +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"gray\") +\n", + " labs(x = \"DHIS2 Population\", y = \"WorldPop Population\", \n", + " title = \"Comparison per ADM2\") +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "931a0bb7-3a2b-4909-96b2-afced4852fa7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "pop_correlation_prov <- cor(comparison_df_prov$dhis2_value, comparison_df_prov$worldpop_value, method = 'pearson')\n", + "print(paste0(\"Correlation : \", round(pop_correlation_prov, 2)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0d79532-9cc0-4678-a23d-7920abe3ed14", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(comparison_df_prov) +\n", + " geom_sf(aes(fill = worldpop_value - dhis2_value)) +\n", + " scale_fill_gradient2(low = \"blue\", mid = \"white\", high = \"red\", midpoint = 0) +\n", + " labs(title = \"Difference: WorldPop - DHIS2 Population (ADM1)\",\n", + " fill = \"Pop. 
Diff\") +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fb1f4e1-a5b4-4685-9ad8-0510f8565659", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "sapply(\n", + " setNames(\n", + " list(comparison_df_prov$dhis2_value, comparison_df_prov$worldpop_value),\n", + " c(\"DHIS2_population\", \"WPOP_population\")\n", + " ),\n", + " summary\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fcc0f64-78b5-4d87-93f0-5783ec2aece1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "310b555f-69a6-44a9-920d-34b517e72804", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - " \n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"tidyr\", \"terra\", \"arrow\", \"sf\", \"dplyr\", \"ggplot2\", \"reticulate\") # reticulate\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6db9d75a-e1b0-410b-b229-de124f035375", - "metadata": {}, - "outputs": [], - "source": [ - "# Set variables to load openhexa.sdk from the right environment\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", 
- "# Load openhexa.sdk\n", - "print(reticulate::py_config()$python)\n", - "tryCatch({ \n", - " openhexa <- import(\"openhexa.sdk\") \n", - "},\n", - "error = function(e) {\n", - " msg <- paste0(\"Error while loading openhexa.sdk\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80e3c014-3830-44cf-a3a8-0cfaee513b6d", - "metadata": {}, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# DHIS2 Dataset extract identifier\n", - "worldpop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$WORLDPOP_DATASET_EXTRACT\n", - "dhis2_formatted_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "7bea2d76-dbbd-4577-ab46-bd6fc1c3142a", - "metadata": {}, - "source": [ - "### Load data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae98618e-d3f9-4626-a6da-8a11922b90c7", - "metadata": {}, - "outputs": [], - "source": [ - "# Select the parquet file from the WorldPop dataset.\n", - "dataset_last_version <- openhexa$workspace$get_dataset(worldpop_dataset)$latest_version\n", - "if (is.null(dataset_last_version)) {\n", - " stop(\"No version available in SNT WorldPop dataset. 
Process stopped.\")\n", - "}\n", - "\n", - "parquet_file <- NULL\n", - "files_list <- reticulate::iterate(dataset_last_version$files)\n", - "for (file in files_list) {\n", - " if (endsWith(file$filename, \".parquet\")) {\n", - " parquet_file <- file$filename\n", - " parquet_file <- paste0(COUNTRY_CODE, \"_\", substring(parquet_file, 5)) # Make sure we select the country file.\n", - " print(paste0(\"Parquet file found: \", parquet_file))\n", - " }\n", - "}\n", - "\n", - "if (is.null(parquet_file)) {\n", - " stop(\"No *.parquet file available in SNT WorldPop dataset. Process stopped.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9dd60960-22e3-49d0-bebc-c0c121f86daf", - "metadata": {}, - "outputs": [], - "source": [ - "# Load worldpop population\n", - "worldpop_population <- tryCatch({ get_latest_dataset_file_in_memory(worldpop_dataset, parquet_file) },\n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading WorldPop population file \",parquet_file,\" for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"WorldPop population file \",parquet_file,\" loaded from dataset : \", worldpop_dataset, \" dataframe dimensions: \", paste(dim(worldpop_population), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de2af2a9-159f-4ffd-969f-9e5f52ac6520", - "metadata": {}, - "outputs": [], - "source": [ - "# Load DHIS2 population\n", - "dhis2_population <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 population file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 population data loaded from dataset : \", dhis2_formatted_dataset, \" 
dataframe dimensions: \", paste(dim(dhis2_population), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "844931d6-4608-49ca-9151-73705c69944e", - "metadata": {}, - "outputs": [], - "source": [ - "# Load DHIS2 shapes data\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "msg <- paste0(\"DHIS2 shapes data loaded from dataset : \", dhis2_formatted_dataset, \" dataframe dimensions: \", paste(dim(shapes_data), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "6394d646-4d4e-4245-a791-6b86c3f1c25e", - "metadata": {}, - "source": [ - "## Zone de sante comparison" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0aefc590-c4e3-4c23-9b4b-e42ac144c27e", - "metadata": {}, - "outputs": [], - "source": [ - "# select the closest year (Worldpop does not update data since 2020)\n", - "worldpop_year <- min(worldpop_population$YEAR) # min year\n", - "if (worldpop_year %in% unique(dhis2_population$YEAR)){\n", - " dhis2_year <- worldpop_year\n", - "} else if(worldpop_year < min(dhis2_population$YEAR)){\n", - " dhis2_year <- min(dhis2_population$YEAR)\n", - "} else {\n", - " dhis2_year <- max(dhis2_population$YEAR)\n", - "}\n", - "\n", - "print(paste0(\"Comparison years DHIS2: \",dhis2_year, \" Worldpop : \", worldpop_year))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d946ffaa-14e7-41aa-9ab8-29656b440e19", - "metadata": {}, - "outputs": [], - "source": [ - "# Select DHIS2 data to closest year\n", - "dhis2_pop_renamed <- dhis2_population %>% \n", - " filter(YEAR == dhis2_year) %>% \n", - " select(ADM2_ID, dhis2_value = POPULATION)\n", - "\n", - "worldpop_pop_renamed 
<- worldpop_population %>% \n", - " filter(YEAR == worldpop_year) %>% \n", - " select(ADM2_ID, worldpop_value = POPULATION)\n", - "\n", - "# 5. Compare WorldPop vs DHIS2 (if you have a matching ID column)\n", - "comparison_df <- left_join(shapes_data, dhis2_pop_renamed[, c(\"ADM2_ID\", \"dhis2_value\")], by = \"ADM2_ID\")\n", - "comparison_df <- left_join(comparison_df, worldpop_pop_renamed[, c(\"ADM2_ID\", \"worldpop_value\")], by = \"ADM2_ID\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d49ccb1-218f-4e1d-a8c1-5e15b9488202", - "metadata": {}, - "outputs": [], - "source": [ - "ggplot(comparison_df, aes(x = dhis2_value, y = worldpop_value)) +\n", - " geom_point() +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"gray\") +\n", - " labs(x = \"DHIS2 Population\", \n", - " y = \"WorldPop Population\", \n", - " title = \"Comparison per ADM2\") +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a26dc148-6f45-45e1-a42a-cb8610a280ee", - "metadata": {}, - "outputs": [], - "source": [ - "pop_correlation <- cor(comparison_df$dhis2_value, comparison_df$worldpop_value, method = 'pearson')\n", - "print(paste0(\"Correlation : \", round(pop_correlation, 2)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b275bbf1-eb89-4e1a-951c-f7fdd014c5aa", - "metadata": {}, - "outputs": [], - "source": [ - "ggplot(comparison_df) +\n", - " geom_sf(aes(fill = worldpop_value - dhis2_value)) +\n", - " scale_fill_gradient2(low = \"blue\", mid = \"white\", high = \"red\", midpoint = 0) +\n", - " labs(title = \"Difference: WorldPop - DHIS2 Population (ADM2)\",\n", - " fill = \"Pop. 
Diff\") +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df524f99-5268-4d41-afdb-9596b39b269e", - "metadata": {}, - "outputs": [], - "source": [ - "sapply(\n", - " setNames(\n", - " list(comparison_df$dhis2_value, comparison_df$worldpop_value),\n", - " c(\"DHIS2_population\", \"WPOP_population\")\n", - " ),\n", - " summary\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "4878bd53-dc03-46ee-8a15-65e90de07221", - "metadata": {}, - "source": [ - "The above table shows that some of the values of WPOP tend to be less plausible, with some Zones de Santé (ZS) having 0 inhabitants, while the largest ZS, which is likely one of the districts of Kinshasa, appears with nearly 22 million inhabitants, thus more than the total population of Kinshasa (17 million). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "477baa64-a3b4-43aa-9a7b-c2ef31f1f0f7", - "metadata": {}, - "outputs": [], - "source": [ - "comp.pop <- comparison_df %>%\n", - " select(ADM2_ID, worldpop_value, dhis2_value) %>%\n", - " mutate(\n", - " diff = worldpop_value - dhis2_value,\n", - " ratio = worldpop_value / dhis2_value,\n", - " relative_diff = diff / dhis2_value\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51534fe9-22dc-4b31-abdf-5fd6a8ba783d", - "metadata": {}, - "outputs": [], - "source": [ - "hist_pop <- st_drop_geometry(comp.pop) %>%\n", - " select(ADM2_ID, worldpop_value, dhis2_value) %>%\n", - " rename(\n", - " WPOP = worldpop_value,\n", - " DHIS2 = dhis2_value\n", - " ) %>%\n", - " pivot_longer(\n", - " cols = c(WPOP, DHIS2),\n", - " names_to = \"source\",\n", - " values_to = \"population\"\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a991e14-a68d-4df1-8dbe-651693824d5d", - "metadata": {}, - "outputs": [], - "source": [ - "ggplot(hist_pop, aes(x=population, color=source)) +\n", - " geom_histogram(fill = NA, alpha = 0.5, position = 
\"identity\", binwidth = 10000) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "16dc7bee-fea1-4d29-b23d-196771b2f094", - "metadata": {}, - "source": [ - "WorldPop data appears to have more variability and some extreme outliers to the right." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77028457-1fca-48b2-9e99-e233bf9286bc", - "metadata": {}, - "outputs": [], - "source": [ - "ggplot(comp.pop, aes(y=relative_diff)) + \n", - " geom_boxplot() + coord_flip() + theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "dbe337de-e8e0-4409-a026-fb5a2232636e", - "metadata": {}, - "source": [ - "The above plot shows that for 75% of Zones de Santé (ZS), the difference between DHIS2 and WorldPop data is within 50% of the total DHIS2, which indicates significant deviation between the two sources. Most of the remaining ZS (except the outliers represented as dots) have differences of population that are 0.5-1.5 of the total DHIS2 population." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b60680c3-4696-48af-9bba-b51210c187d8", - "metadata": {}, - "outputs": [], - "source": [ - "ggplot(comparison_df) +\n", - " geom_sf(aes(fill = (worldpop_value - dhis2_value)/dhis2_value)) +\n", - " scale_fill_gradient2(low = \"blue\", mid = \"white\", high = \"red\", midpoint = 0) +\n", - " labs(title = \"ADM2 Relative difference: WorldPop - DHIS2 Population (reference)\",\n", - " fill = \"Relative Pop. 
Diff\") +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "bb54485d-e3b1-4f5d-83b0-a20981e233e9", - "metadata": {}, - "source": [ - "## Province comparison" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7d250c2-7353-4926-b0b5-7f7ea81005ec", - "metadata": {}, - "outputs": [], - "source": [ - "# Group by province (ADM1) and dissolve geometries\n", - "dhis2_shapes_provinces <- shapes_data %>%\n", - " group_by(ADM1_ID) %>%\n", - " summarise(geometry = st_union(geometry), .groups = \"drop\")\n", - "\n", - "# Group pop by provinces\n", - "dhis2_pop_prov <- dhis2_population %>%\n", - " group_by(ADM1_NAME, ADM1_ID) %>%\n", - " summarise(dhis2_value = sum(POPULATION, na.rm = TRUE))\n", - "\n", - "# Group pop by provinces\n", - "worldpop_pop_prov <- worldpop_population %>%\n", - " group_by(ADM1_NAME, ADM1_ID) %>%\n", - " summarise(worldpop_value = sum(POPULATION, na.rm = TRUE))\n", - "\n", - "comparison_df_prov <- left_join(dhis2_shapes_provinces, dhis2_pop_prov, by = c(\"ADM1_ID\"))\n", - "comparison_df_prov <- left_join(comparison_df_prov, worldpop_pop_prov, by = c(\"ADM1_ID\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "353031e2-475e-4478-b643-aa7e86885365", - "metadata": {}, - "outputs": [], - "source": [ - "ggplot(comparison_df_prov, aes(x = dhis2_value, y = worldpop_value)) +\n", - " geom_point() +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"gray\") +\n", - " labs(x = \"DHIS2 Population\", y = \"WorldPop Population\", \n", - " title = \"Comparison per ADM2\") +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "931a0bb7-3a2b-4909-96b2-afced4852fa7", - "metadata": {}, - "outputs": [], - "source": [ - "pop_correlation_prov <- cor(comparison_df_prov$dhis2_value, comparison_df_prov$worldpop_value, method = 'pearson')\n", - "print(paste0(\"Correlation : \", round(pop_correlation_prov, 2)))" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "c0d79532-9cc0-4678-a23d-7920abe3ed14", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "ggplot(comparison_df_prov) +\n", - " geom_sf(aes(fill = worldpop_value - dhis2_value)) +\n", - " scale_fill_gradient2(low = \"blue\", mid = \"white\", high = \"red\", midpoint = 0) +\n", - " labs(title = \"Difference: WorldPop - DHIS2 Population (ADM1)\",\n", - " fill = \"Pop. Diff\") +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fb1f4e1-a5b4-4685-9ad8-0510f8565659", - "metadata": {}, - "outputs": [], - "source": [ - "sapply(\n", - " setNames(\n", - " list(comparison_df_prov$dhis2_value, comparison_df_prov$worldpop_value),\n", - " c(\"DHIS2_population\", \"WPOP_population\")\n", - " ),\n", - " summary\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fcc0f64-78b5-4d87-93f0-5783ec2aece1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r b/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r new file mode 100644 index 0000000..c9dda99 --- /dev/null +++ b/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r @@ -0,0 +1,56 @@ +find_country_parquet_file <- function(dataset_last_version, country_code) { + parquet_file <- NULL + files_list <- reticulate::iterate(dataset_last_version$files) + for (file in files_list) { + if (endsWith(file$filename, ".parquet")) { + parquet_file <- paste0(country_code, "_", substring(file$filename, 5)) + print(paste0("Parquet 
file found: ", parquet_file)) + } + } + parquet_file +} + + +get_comparison_years <- function(worldpop_population, dhis2_population) { + worldpop_year <- min(worldpop_population$YEAR) + if (worldpop_year %in% unique(dhis2_population$YEAR)) { + dhis2_year <- worldpop_year + } else if (worldpop_year < min(dhis2_population$YEAR)) { + dhis2_year <- min(dhis2_population$YEAR) + } else { + dhis2_year <- max(dhis2_population$YEAR) + } + list(worldpop_year = worldpop_year, dhis2_year = dhis2_year) +} + + +build_adm2_comparison <- function(shapes_data, dhis2_population, worldpop_population, dhis2_year, worldpop_year) { + dhis2_pop_renamed <- dhis2_population %>% + dplyr::filter(YEAR == dhis2_year) %>% + dplyr::select(ADM2_ID, dhis2_value = POPULATION) + + worldpop_pop_renamed <- worldpop_population %>% + dplyr::filter(YEAR == worldpop_year) %>% + dplyr::select(ADM2_ID, worldpop_value = POPULATION) + + comparison_df <- dplyr::left_join(shapes_data, dhis2_pop_renamed[, c("ADM2_ID", "dhis2_value")], by = "ADM2_ID") + dplyr::left_join(comparison_df, worldpop_pop_renamed[, c("ADM2_ID", "worldpop_value")], by = "ADM2_ID") +} + + +build_adm1_comparison <- function(shapes_data, dhis2_population, worldpop_population) { + dhis2_shapes_provinces <- shapes_data %>% + dplyr::group_by(ADM1_ID) %>% + dplyr::summarise(geometry = sf::st_union(geometry), .groups = "drop") + + dhis2_pop_prov <- dhis2_population %>% + dplyr::group_by(ADM1_NAME, ADM1_ID) %>% + dplyr::summarise(dhis2_value = sum(POPULATION, na.rm = TRUE), .groups = "drop") + + worldpop_pop_prov <- worldpop_population %>% + dplyr::group_by(ADM1_NAME, ADM1_ID) %>% + dplyr::summarise(worldpop_value = sum(POPULATION, na.rm = TRUE), .groups = "drop") + + comparison_df_prov <- dplyr::left_join(dhis2_shapes_provinces, dhis2_pop_prov, by = c("ADM1_ID")) + dplyr::left_join(comparison_df_prov, worldpop_pop_prov, by = c("ADM1_ID")) +} diff --git a/snt_dhs_indicators/pipeline.py b/snt_dhs_indicators/pipeline.py index ba6bbec..e1eba10 100644 
--- a/snt_dhs_indicators/pipeline.py +++ b/snt_dhs_indicators/pipeline.py @@ -55,6 +55,8 @@ def dhs_indicators(run_reports_only: bool, pull_scripts: bool) -> None: "snt_dhs_mortality_computation.ipynb", "snt_dhs_prevalence_computation.ipynb", "snt_dhs_vaccination_computation.ipynb", + "utils/snt_dhs_indicator_tables.r", + "utils/snt_dhs_careseeking_computation.r", ], ) @@ -223,6 +225,7 @@ def run_dhs_indicator_notebooks( computation_notebook_name (str): Filename of the computation notebook. reporting_notebook_name (str): Filename of the reporting notebook. run_report_only (bool): If True, only the reporting notebook will be executed. + country_code (str | None): Country code used for notebook execution context. """ computation_notebook_path = pipeline_root_path / "code" / computation_notebook_name From e8a72fbe0aa09d69b8e1bb637c309c1c99462a8b Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 31 Mar 2026 16:56:51 +0200 Subject: [PATCH 21/23] little fix --- .../code/snt_dhs_prevalence_computation.ipynb | 152 +++++++++--------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb index d7b950b..60d8c5f 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb @@ -2,22 +2,23 @@ "cells": [ { "cell_type": "markdown", + "id": "1a337757-f2fa-467e-8241-ea174c7ea790", "metadata": {}, "source": [ "# Under-5 Prevalence of Malaria (DHS data)" - ], - "id": "1a337757-f2fa-467e-8241-ea174c7ea790" + ] }, { "cell_type": "markdown", + "id": "fc27d9c1-0c0c-46df-9508-57add133acaf", "metadata": {}, "source": [ "## Resources" - ], - "id": "fc27d9c1-0c0c-46df-9508-57add133acaf" + ] }, { "cell_type": "markdown", + "id": "9cf1724b-b01c-4b2e-93f7-d0f54cfa4850", "metadata": {}, "source": [ 
"https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Prevalence_of_Malaria_in_Children.htm%23Percentage_of_children22bc-1&rhtocid=_15_13_0\n", @@ -29,32 +30,34 @@ "Denominators:\n", "a) Number of de facto children tested using RDT (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 in 0,1)\n", "b) Number of de facto children tested using microscopy (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 in 0,1,6)" - ], - "id": "9cf1724b-b01c-4b2e-93f7-d0f54cfa4850" + ] }, { "cell_type": "markdown", + "id": "7c61058a-361f-4992-9f2b-99a82f798fd8", "metadata": {}, "source": [ "**Project uses RDT**" - ], - "id": "7c61058a-361f-4992-9f2b-99a82f798fd8" + ] }, { "cell_type": "markdown", + "id": "d0e715b0-9a8d-4d15-b7ef-486f16fec73b", "metadata": {}, "source": [ "## Preliminary steps" - ], - "id": "d0e715b0-9a8d-4d15-b7ef-486f16fec73b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a3cc2680-f9cc-46cd-9565-2b8af14fc29a", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "rm(list = ls())\n", "\n", @@ -101,43 +104,43 @@ "\n", "# Set config variables\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n" - ], - "execution_count": null, - "outputs": [], - "id": "a3cc2680-f9cc-46cd-9565-2b8af14fc29a" + ] }, { "cell_type": "markdown", + "id": "43135a7a-f1b8-4a89-9db1-9012c3369f3d", "metadata": {}, "source": [ "## Get data" - ], - "id": "43135a7a-f1b8-4a89-9db1-9012c3369f3d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7e83a6c1-cd47-4eeb-b949-7bef7d1f36c6", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "admin_level <- 'ADM1'\n", "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", "admin_cols <- c(admin_id_col, admin_name_col)" - ], - "execution_count": null, - "outputs": [], - "id": "7e83a6c1-cd47-4eeb-b949-7bef7d1f36c6" + ] }, { "cell_type": "code", + "execution_count": null, + "id": 
"9a623578-170f-42e0-a012-78f0ddbcce87", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Load spatial file from dataset\n", "\n", @@ -166,26 +169,26 @@ "\n", "admin_data <- st_drop_geometry(spatial_data)\n", "setDT(admin_data)" - ], - "execution_count": null, - "outputs": [], - "id": "9a623578-170f-42e0-a012-78f0ddbcce87" + ] }, { "cell_type": "markdown", + "id": "11aa09f3-ecc8-46c0-b9f3-95c0520202ef", "metadata": {}, "source": [ "### Import " - ], - "id": "11aa09f3-ecc8-46c0-b9f3-95c0520202ef" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c015490b-a8fc-471d-a83f-0dce8e010cef", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "indicator_u5prev <- 'PCT_U5_PREV_RDT' # to be computed\n", "\n", @@ -220,26 +223,26 @@ "\n", "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", "dhs_pr_dt <- setDT(dhs_pr_dt)" - ], - "execution_count": null, - "outputs": [], - "id": "c015490b-a8fc-471d-a83f-0dce8e010cef" + ] }, { "cell_type": "markdown", + "id": "8b191b87-3694-4883-99c9-66ade3477f8e", "metadata": {}, "source": [ "### Make admin dataframe (for future merging)" - ], - "id": "8b191b87-3694-4883-99c9-66ade3477f8e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e0767159-1660-48c3-bdaf-f8be3643e039", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "dhs_admin_dt <- make_dhs_admin_df(\n", " input_dhs_df=dhs_hr_dt,\n", @@ -260,26 +263,26 @@ "}\n", "\n", "rm(dhs_hr_dt) # free up resources" - ], - "execution_count": null, - "outputs": [], - "id": "e0767159-1660-48c3-bdaf-f8be3643e039" + ] }, { "cell_type": "markdown", + "id": "8a47d5f6-f8d8-4373-9cc8-a776d85b8a75", "metadata": {}, "source": [ "### Preprocess" - ], - "id": "8a47d5f6-f8d8-4373-9cc8-a776d85b8a75" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f49ecb55-da48-4cf5-aa2b-7691eed04b77", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": 
[], "source": [ "# Relevant columns\n", "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", @@ -289,18 +292,18 @@ "kid_age_col <- \"HC1\"\n", "smear_result_col <- \"HML32\" # smear test (GE)\n", "rdt_result_col <- \"HML35\" # rapid diagnostic test (RDT / TDR)" - ], - "execution_count": null, - "outputs": [], - "id": "f49ecb55-da48-4cf5-aa2b-7691eed04b77" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a5fe638a-7418-4049-822e-6c3d715fdded", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# sapply(kid_age_cols, function(i) table(dhs_pr_dt[[i]], useNA = 'always'))\n", "\n", @@ -329,26 +332,26 @@ "pr_dt[, wt := HV005/1000000]\n", "\n", "pr_dt <- merge.data.table(dhs_admin_dt, pr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)" - ], - "execution_count": null, - "outputs": [], - "id": "a5fe638a-7418-4049-822e-6c3d715fdded" + ] }, { "cell_type": "markdown", + "id": "7fa09c3c-d462-42f0-830a-444c79addf86", "metadata": {}, "source": [ "## Rapid Diagnostic Test Indicator" - ], - "id": "7fa09c3c-d462-42f0-830a-444c79addf86" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "af0796b0-6f8e-40a0-8a41-e837ce0b02ee", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "xtabs( ~ get(rdt_result_col), data = pr_dt, addNA = TRUE)\n", "\n", @@ -366,18 +369,18 @@ " num_p=1, # ? 
dunno what this is\n", " nest = T # the primary sampling units are nested within the strata\n", ")" - ], - "execution_count": null, - "outputs": [], - "id": "af0796b0-6f8e-40a0-8a41-e837ce0b02ee" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "028130f5-8522-4a2f-81d9-25630affdab3", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", "malaria_rdt_table <- compute_and_export_indicator_table(\n", @@ -389,78 +392,75 @@ " output_data_path = OUTPUT_DATA_PATH,\n", " filename_without_extension = filename_without_extension\n", ")" - ], - "execution_count": null, - "outputs": [], - "id": "028130f5-8522-4a2f-81d9-25630affdab3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "bcef2f47-d0d8-4cea-8477-a192b92bb9a8", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# prevalence table computed and exported in previous cell." 
- ], - "execution_count": null, - "outputs": [], - "id": "bcef2f47-d0d8-4cea-8477-a192b92bb9a8" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e975d45b-1e32-442c-a859-d65ad3db904c", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# handled by compute_and_export_indicator_table()" - ], - "execution_count": null, - "outputs": [], - "id": "e975d45b-1e32-442c-a859-d65ad3db904c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "706e8de0-643f-4b68-a9e7-8c9dcd7bfba7", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# handled by compute_and_export_indicator_table()" - ], - "execution_count": null, - "outputs": [], - "id": "706e8de0-643f-4b68-a9e7-8c9dcd7bfba7" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "be99a596-2647-4399-979f-4fd5855bd7cf", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# already merged and exported above" - ], - "execution_count": null, - "outputs": [], - "id": "be99a596-2647-4399-979f-4fd5855bd7cf" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "d8d7236b-bf86-4da4-bc15-ca97b57b5ba4", "metadata": { "vscode": { "languageId": "r" } }, - "source": [], - "execution_count": null, "outputs": [], - "id": "d8d7236b-bf86-4da4-bc15-ca97b57b5ba4" + "source": [] } ], "metadata": { @@ -480,4 +480,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 9f87eb061e549cdde243053b9f15a35a452cb186 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 2 Apr 2026 11:14:56 +0200 Subject: [PATCH 22/23] more cleaning --- .../code/NER_pyramid_format.ipynb | 20 +- .../utils/snt_dhis2_extract.r | 30 + .../snt_dhis2_population_transformation.ipynb | 20 +- .../snt_dhis2_population_transformation.r | 29 + ...snt_dhis2_reporting_rate_dataelement.ipynb | 2056 ++++++++--------- .../code/snt_dhs_bednets_computation.ipynb | 39 +- .../snt_dhs_careseeking_computation.ipynb | 41 +- 
.../code/snt_dhs_mortality_computation.ipynb | 737 +++--- .../code/snt_dhs_prevalence_computation.ipynb | 41 +- .../snt_dhs_vaccination_computation.ipynb | 38 +- .../utils/snt_dhs_indicator_tables.r | 52 + .../code/snt_healthcare_access.ipynb | 22 +- .../utils/snt_healthcare_access.r | 38 + .../code/snt_seasonality_cases.ipynb | 43 +- .../utils/snt_seasonality_cases.r | 38 + 15 files changed, 1655 insertions(+), 1589 deletions(-) diff --git a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb index be611d1..d29c52f 100644 --- a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb +++ b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb @@ -34,22 +34,12 @@ "outputs": [], "source": [ "# PROJECT PATHS\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code')\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\")\n", + "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- \"~/workspace\"\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\")\n", "\n", - "# Load snt utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_extract.r\"))\n", - "\n", - "# Load libraries\n", - "required_packages <- c(\"arrow\", \"dplyr\", \"tidyverse\", \"jsonlite\", \"reticulate\", \"glue\", \"sf\")\n", - "install_and_load(required_packages)\n", - "\n", - "# # Load openhexa.sdk and set environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "openhexa_toolbox <- import(\"openhexa.toolbox\")" + "setup_ctx <- bootstrap_dhis2_extract_context(root_path = ROOT_PATH)\n", + "SNT_ROOT_PATH <- setup_ctx$ROOT_PATH" ] }, { @@ -128,7 +118,7 @@ " 'sB6YOzTUHkF', 'oihgQahh9LH', 'DrQFMU6RoCG', 'pwD7FU7Qfyz', 'dgDPQhxqOcJ','Gox5G2BIGBf')\n", "\n", "# Create ou groups\n", - "ou_groups = 
read_parquet(file.path(SNT_ROOT_PATH, \"data/dhis2/extracts_raw/organisation_unit_groups/NER_organisation_unit_groups_raw.parquet\")) # hardcoded\n", + "ou_groups <- read_parquet(file.path(SNT_ROOT_PATH, \"data\", \"dhis2\", \"extracts_raw\", \"organisation_unit_groups\", \"NER_organisation_unit_groups_raw.parquet\"))\n", "ou_groups_exploded <- unnest(ou_groups, cols = c(organisation_units)) \n", "ou_selection <- ou_groups_exploded[ou_groups_exploded$id %in% liste_groupes_prioritaires, ]\n", "group_prioritaires_table <- pyramid_df[pyramid_df$id %in% unique(ou_selection$organisation_units), ]" diff --git a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r index 54304b7..9010dcc 100644 --- a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r +++ b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r @@ -1,5 +1,35 @@ # Shared helpers for snt_dhis2_extract notebooks. +bootstrap_dhis2_extract_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "dplyr", "tidyverse", "jsonlite", "reticulate", "glue", "sf"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + pipeline_path <- file.path(root_path, "pipelines", "snt_dhis2_extract") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + openhexa <- NULL + openhexa_toolbox <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + openhexa_toolbox <- reticulate::import("openhexa.toolbox") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + assign("openhexa_toolbox", openhexa_toolbox, envir = .GlobalEnv) + + list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + PIPELINE_PATH = pipeline_path, + openhexa = openhexa, + openhexa_toolbox = openhexa_toolbox + ) +} + printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), 
"columns\n\n") } diff --git a/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb b/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb index 3135161..0b4457f 100644 --- a/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb +++ b/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb @@ -35,23 +35,17 @@ "outputs": [], "source": [ "# Set project folders\n", - "SNT_ROOT_PATH <- '~/workspace'\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_population_transformation\")\n", - "POPULATION_DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\", \"population_transformed\")\n", + "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_population_transformation\")\n", "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_population_transformation.r\"))\n", + "setup_ctx <- bootstrap_population_transformation_context(root_path = ROOT_PATH)\n", "\n", - "# List required pcks\n", - "required_packages <- c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"glue\", \"reticulate\", \"rlang\")\n", - "install_and_load(required_packages)\n", + "SNT_ROOT_PATH <- setup_ctx$ROOT_PATH\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "POPULATION_DATA_PATH <- setup_ctx$POPULATION_DATA_PATH\n", "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "reticulate::py_config()$python" ] }, { diff --git 
a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r index f610951..0a4a006 100644 --- a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r +++ b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r @@ -1,3 +1,32 @@ +bootstrap_population_transformation_context <- function( + root_path = "~/workspace", + required_packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate", "rlang"), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + population_data_path <- file.path(root_path, "data", "dhis2", "population_transformed") + dir.create(population_data_path, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + + list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + POPULATION_DATA_PATH = population_data_path, + openhexa = openhexa + ) +} + get_total_population_reference <- function(config_json, adjust_with_untotals = FALSE) { if (!adjust_with_untotals) { return(NULL) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index ac7cce0..8f46501 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1,1076 +1,1068 @@ { - "cells": [ - { - 
"cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ] }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. Setup" + ] }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", + "\n", + "# 
Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", + "\n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", - "\n", - "# Load libraries\n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": "2026-01-16T10:23:56.165873", - "exception": 
false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00011, + "end_time": "2026-01-16T10:23:56.165873", + "exception": false, + "start_time": "2026-01-16T10:23:56.165763", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.1. Fallback parameters values\n", + "These parameters are injected by papermill when running in OH via pipeline run interface.
\n", + "The code cell below here provides fallback parameter values needed when running this notebook locally." + ] }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "papermill": { + "duration": 0.033954, + "end_time": "2026-01-16T10:23:56.199937", + "exception": false, + "start_time": "2026-01-16T10:23:56.165983", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Current options: \n", + "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", + "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", + "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", + "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", + "\n", + "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", + "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", + "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Current options: \n", - "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", - "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", - 
"\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 9.5e-05, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" + { + "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 0.000095, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.2. Load and check `snt config` file" + ] }, - "tags": [] - }, - "source": [ - "### 1.2. 
Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "papermill": { + "duration": 0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": 
"c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", + "\n", + "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", + "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", + "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", - "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", - "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", - "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", - "# See: https://bluesquare.atlassian.net/browse/SNT25-158\n", - "# The 
variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "8bf4a8bb", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b40207", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 9.3e-05, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" + { + "cell_type": "markdown", + "id": "8bf4a8bb", + "metadata": {}, + "source": [ + "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", + "The user can toggle on/off each of the indicators. Therefore, we need to make sure at least one is ON.
\n", + "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 6.9e-05, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "18b40207", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!length(ACTIVITY_INDICATORS) > 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 0.000093, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. Load Data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", - "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)\n" - ] - }, - { - "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 0.000069, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." + ] }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", + "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted, 2)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": "2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" + { + "cell_type": "markdown", + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" + }, + 
"tags": [] + }, + "source": [ + "### 2.2. Organisation units (DHIS2 pyramid)" + ] }, - "tags": [] - }, - "source": [ - "### 2.3. Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "2fd92901-901e-4019-be78-a7718050c1c4", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted, 2)\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", - "metadata": { - "papermill": { - "duration": 9.1e-05, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" + { + "cell_type": "markdown", + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ] }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ] - }, - { - "cell_type": "markdown", - "id": "7d62cdb6", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", + "}\n", + "\n", + "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", + " cat(msg)\n", + " stop(msg)\n", + "}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" + { + "cell_type": "markdown", + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ] }, - "tags": [] - }, - "source": [ - "#### 3.1. Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" + { + "cell_type": "markdown", + "id": "7d62cdb6", + "metadata": {}, + "source": [ + "#### 3.0. Define start and end period based on routine data " + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", - "facility_master <- build_facility_master_dataelement(\n", - " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", - " period_vector = period_vector,\n", - " config_json = config_json,\n", - " ADMIN_1 = ADMIN_1,\n", - " ADMIN_2 = ADMIN_2\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", + "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", + "\n", + "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" + ] }, - "tags": [] - }, - "source": [ - "#### 3.2. Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. 
Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b279d27", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "reporting_rate_dataelement <- compute_reporting_rate_dataelement(\n", - " facility_master = facility_master,\n", - " dhis2_routine = dhis2_routine,\n", - " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", - " ACTIVITY_INDICATORS = ACTIVITY_INDICATORS,\n", - " VOLUME_ACTIVITY_INDICATORS = VOLUME_ACTIVITY_INDICATORS,\n", - " DATAELEMENT_METHOD_DENOMINATOR = DATAELEMENT_METHOD_DENOMINATOR,\n", - " USE_WEIGHTED_REPORTING_RATES = USE_WEIGHTED_REPORTING_RATES\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" + { + "cell_type": "markdown", + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ] }, - "tags": [] - }, - "source": [ - "#### 3.3. Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. 
The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "9308197a-0852-4d34-8888-cf5564f35a9d", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", + "facility_master <- build_facility_master_dataelement(\n", + " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", + " period_vector = period_vector,\n", + " config_json = config_json,\n", + " ADMIN_1 = ADMIN_1,\n", + " ADMIN_2 = ADMIN_2\n", + ")\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "657fd6ca", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] - }, - { - "cell_type": "markdown", - "id": "a598e4b7", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" + { + "cell_type": "markdown", + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", - "metadata": { - "papermill": { - "duration": 9.8e-05, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "7b279d27", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "reporting_rate_dataelement <- compute_reporting_rate_dataelement(\n", + " facility_master = facility_master,\n", + " dhis2_routine = dhis2_routine,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " ACTIVITY_INDICATORS = ACTIVITY_INDICATORS,\n", + " VOLUME_ACTIVITY_INDICATORS = VOLUME_ACTIVITY_INDICATORS,\n", + " DATAELEMENT_METHOD_DENOMINATOR = DATAELEMENT_METHOD_DENOMINATOR,\n", + " USE_WEIGHTED_REPORTING_RATES = USE_WEIGHTED_REPORTING_RATES\n", + ")\n" + ] }, - 
"tags": [] - }, - "source": [ - "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" + { + "cell_type": "markdown", + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
+ ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", + "metadata": { + "papermill": { + "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - "status": "completed" + { + "cell_type": "markdown", + "id": "657fd6ca", + "metadata": {}, + "source": [ + "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" + { + "cell_type": "markdown", + "id": "a598e4b7", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", + "
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ] }, - "tags": [] - }, - "source": [ - "#### 3.7. Aggregate data at ADM2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af13191e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "7d381937", - "metadata": {}, - "source": [ - "#### 3.8. Calculate Reporting Rates (all methods)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b41263f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "5e593659", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", - "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [] - }, - "source": [ - "## 4. 
Select correct col for `REPORTING_RATE` based on denominator method" - ] - }, - { - "cell_type": "markdown", - "id": "c75f2249", - "metadata": { - "papermill": { - "duration": 5.7e-05, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" + { + "cell_type": "markdown", + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", + "metadata": { + "papermill": { + "duration": 0.000098, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.5. Compute Weighting factor based on \"volume of activity\"" + ] }, - "tags": [] - }, - "source": [ - "### 4.1. Select results and format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e71b38", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "4420e559-4134-4fc3-8950-9972ebede00e", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3df36abb", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" + { + "cell_type": "markdown", + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", + "metadata": { + 
"papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.6. Compute Weighted variables" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] - }, - { - "cell_type": "markdown", - "id": "ca66e785", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" + { + "cell_type": "markdown", + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.7. Aggregate data at ADM2 level" + ] }, - "tags": [] - }, - "source": [ - "## 5. 
Inspect reporting rate values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31535459", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "af13191e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6778f17d", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": false, - "start_time": "2026-01-16T10:24:07.836278", - "status": "completed" + { + "cell_type": "markdown", + "id": "7d381937", + "metadata": {}, + "source": [ + "#### 3.8. 
Calculate Reporting Rates (all methods)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "b41263f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e593659", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 4. Select correct col for `REPORTING_RATE` based on denominator method" + ] }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "markdown", + "id": "c75f2249", + "metadata": { + "papermill": { + "duration": 0.000057, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 4.1. 
Select results and format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e71b38", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df36abb", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccc272c", + "metadata": { + "papermill": { + "duration": 0.182574, + "end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ca66e785", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. 
Inspect reporting rate values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31535459", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6778f17d", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7f013fd", + "metadata": { + "papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot 
without outliers\n", + " geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "2866816a-7015-4c5c-b904-f553f3b4790d", + "metadata": { + "papermill": { + "duration": 0.000088, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 6. 📁 Export to `data/` folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", + "metadata": { + "papermill": { + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "export_reporting_rate_dataelement(\n", + " reporting_rate_dataelement = reporting_rate_dataelement,\n", + " DATA_PATH = DATA_PATH,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" + ] } - }, - "outputs": [], - "source": [ - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - 
" ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", - "metadata": { - "papermill": { - "duration": 8.8e-05, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "## 5. 📁 Export to `data/` folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "tags": [], - "vscode": { - "languageId": "r" + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "AVAILABILITY_INDICATORS": [ + "CONF", + "PRES", + "SUSP", + "TEST" + ], + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "export_reporting_rate_dataelement(\n", - " 
reporting_rate_dataelement = reporting_rate_dataelement,\n", - " DATA_PATH = DATA_PATH,\n", - " COUNTRY_CODE = COUNTRY_CODE\n", - ")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - "AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb index f2fdb2e..82cb250 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb @@ -147,12 +147,7 @@ "source": [ "# Paths\n", "ROOT_PATH <- '~/workspace'\n", - "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 
'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')" + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')" ] }, { @@ -167,14 +162,15 @@ "outputs": [], "source": [ "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" + "setup_ctx <- bootstrap_dhs_indicators_context(root_path = ROOT_PATH)\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)" ] }, { @@ -188,24 +184,7 @@ }, "outputs": [], "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + "reticulate::py_config()$python" ] }, { diff --git 
a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb index aa7435a..bcc6013 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb @@ -83,29 +83,21 @@ }, "outputs": [], "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", "# Paths\n", "ROOT_PATH <- '~/workspace'\n", "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", "\n", "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_careseeking_computation.r\"))\n", "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" + "setup_ctx <- bootstrap_dhs_indicators_context(root_path = ROOT_PATH)\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)" ] }, { @@ -119,24 +111,7 @@ }, "outputs": [], "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - 
"reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + "reticulate::py_config()$python" ] }, { diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb index 9c5a000..fe9a51d 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb @@ -1,368 +1,375 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "5ce00012-556c-45dc-a572-a38b2205e5a8", - "metadata": {}, - "source": [ - "# Under-five mortality (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "5ce00012-556c-45dc-a572-a38b2205e5a8", + "metadata": {}, + "source": [ + "# Under-five mortality (DHS data)" + ] + }, + { + "cell_type": "markdown", + "id": "e455838e-0ba0-475f-8860-07fd0705fdae", + "metadata": {}, + "source": [ + "## Resources" + ] + }, + { + "cell_type": "markdown", + "id": "ca3f5f16-0df0-432d-9363-e4fcb4ab195b", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Early_Childhood_Mortality.htm\n", + "\n", + "Under-5 Mortality Rate (U5MR)\n", + "The under-5 mortality rate is the probability (expressed as a rate per 1,000 live births) of a child exposed in a specific period dying before reaching the age of five years.\n", + "\n", + "\n", + "Coverage:\n", + "Population base: 
Live births to respondents (BR file)\n", + "\n", + "Time period: Five-year or ten-year periods of time preceding the survey (v008-1 to v008-60 or v008-120 months), excluding the month of interview\n", + "\n", + "Numerators:\n", + "Number of deaths to live-born children during specified age range and specified time period\n", + "Under-5 mortality: Deaths at ages 0 to 4 years, including deaths reported at ages 0 to 59 months and 0 to 99 days\n", + "\n", + "Denominator: Number of surviving children at beginning of specified age range during the specified time period\n", + "\n", + "Variables: BR file.\n", + "\n", + "b3 Date of birth of child (CMC)\n", + "\n", + "b5 Child is alive (1 = Yes, 0 = No)\n", + "\n", + "b7 Age at death in months (imputed)\n", + "\n", + "v008 Date of interview (CMC)\n", + "\n", + "v005 Woman’s individual sample weight" + ] + }, + { + "cell_type": "markdown", + "id": "a2f57835-fe83-44fe-b9a3-a0e21381c21b", + "metadata": {}, + "source": [ + "## Preliminary steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc050b96-7cd0-4eba-aa9b-0744e531dd4c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", + "setup_ctx <- bootstrap_dhs_indicators_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\", \"DHS.rates\")\n", + ")\n", + "\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 
'mortality')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "reticulate::py_config()$python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91cd32a3-7ec7-467d-917c-2a2fbd0f13a1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ] + }, + { + "cell_type": "markdown", + "id": "1f0a0363-d31a-4667-bad2-f3aca94c57ed", + "metadata": {}, + "source": [ + "## Geo/admin data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca05f908-4c8a-4fd7-832c-6d52a7da83e8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ] + }, + { 
+ "cell_type": "markdown", + "id": "1e7a06e8-fe84-4a75-aac8-316abc36e7ce", + "metadata": {}, + "source": [ + "## Import DHS data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ec3b998-25a8-455b-8b61-4201d300888e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "indicator_u5mr <- 'U5MR_PERMIL' # to be computed\n", + "\n", + "data_source <- 'DHS'\n", + "household_recode <- 'HR'\n", + "births_recode <- 'BR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", + "\n", + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_br_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, births_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_br_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('U5_MORT', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_br_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", births_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_br_filename)){\n", + " stop(\"The input DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_br_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_br_filename)) # births recode\n", + "dhs_br_dt <- setDT(dhs_br_dt)\n", + "\n", + "# Make admin codes and names dataframe (for future merging)\n", + "\n", + "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}\n", + "\n", + "rm(dhs_hr_dt) # free up resources" + ] + }, + { + "cell_type": "markdown", + "id": "71af8f28-3a2d-4ae2-812c-7cec9cd89288", + "metadata": {}, + "source": [ + "## Preprocess DHS data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e560e51b-369e-4365-868b-1285a5522f7c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Relevant columns\n", + "household_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", + "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", + "birth_date_col <- \"B3\" # Date of birth of child (CMC)\n", + "alive_col <- \"B5\" # Child is alive (1 = Yes, 0 = No)\n", + "death_age_col <-\"B7\" # Age at death in months (imputed)\n", + "end_date_col <- \"V008\" # Date of interview (CMC)\n", + "\n", + "dhs_br_dt[, (birth_date_col) := as.integer(get(birth_date_col))]\n", + "dhs_br_dt[, (death_age_col) := as.integer(get(death_age_col))]\n", + "dhs_br_dt[, (end_date_col) := as.integer(get(end_date_col))]\n", + "\n", + "dhs_br_dt <- dhs_br_dt[\n", + " ,\n", + " .SD,\n", + " .SDcols = c(\n", + " household_id_cols,\n", + " household_sampling_cols,\n", + " birth_date_col,\n", + " alive_col,\n", + " death_age_col,\n", + " end_date_col\n", + ")\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "d48831a5-2b1f-4201-9438-e7fca9b8accb", + "metadata": {}, + "source": [ + "## Compute indicator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf8c11a9-e69c-47e2-87df-a693b9e68cd3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "region_dt_list <- split(dhs_br_dt, by = \"V024\")\n", + "\n", + "u5mort_table <- rbindlist(\n", + " lapply(region_dt_list, make_dhs_adm1_u5mort_dt) \n", + ")\n", + "\n", + "lower_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_LOWER_BOUND\")\n", + "upper_bound_col <- 
glue(\"{toupper(indicator_u5mr)}_CI_UPPER_BOUND\")\n", + "sample_avg_col <- glue(\"{toupper(indicator_u5mr)}_SAMPLE_AVERAGE\")\n", + "\n", + "# add necessary missing columns and remove non-necessary present columns\n", + "u5mort_table <- merge.data.table(dhs_admin_dt, u5mort_table, by = 'DHS_ADM1_CODE', all = TRUE)\n", + "setnames(u5mort_table,\n", + " old=c(\"R\", \"LCI\", \"UCI\"),\n", + " new=c(\n", + " sample_avg_col,\n", + " lower_bound_col,\n", + " upper_bound_col\n", + " ),\n", + " skip_absent=TRUE # not changing all names\n", + " )\n", + "u5mort_table <- merge.data.table(admin_data, u5mort_table, by = admin_name_col)\n", + "u5mort_table <- u5mort_table[\n", + " ,\n", + " .SD,\n", + " .SDcols = c(\n", + " admin_cols,\n", + " sample_avg_col,\n", + " lower_bound_col,\n", + " upper_bound_col\n", + " )\n", + " ]\n", + "\n", + "# Cap the CI's at 0 (in case of small numbers)\n", + "u5mort_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", + "\n", + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}\")\n", + "write.csv(u5mort_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", + "write_parquet(u5mort_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25bf0a56-a8bf-45a9-b63f-d0bdbd606fde", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "e455838e-0ba0-475f-8860-07fd0705fdae", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": 
"ca3f5f16-0df0-432d-9363-e4fcb4ab195b", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Early_Childhood_Mortality.htm\n", - "\n", - "Under-5 Mortality Rate (U5MR)\n", - "The under-5 mortality rate is the probability (expressed as a rate per 1,000 live births) of a child exposed in a specific period dying before reaching the age of five years.\n", - "\n", - "\n", - "Coverage:\n", - "Population base: Live births to respondents (BR file)\n", - "\n", - "Time period: Five-year or ten-year periods of time preceding the survey (v008-1 to v008-60 or v008-120 months), excluding the month of interview\n", - "\n", - "Numerators:\n", - "Number of deaths to live-born children during specified age range and specified time period\n", - "Under-5 mortality: Deaths at ages 0 to 4 years, including deaths reported at ages 0 to 59 months and 0 to 99 days\n", - "\n", - "Denominator: Number of surviving children at beginning of specified age range during the specified time period\n", - "\n", - "Variables: BR file.\n", - "\n", - "b3 Date of birth of child (CMC)\n", - "\n", - "b5 Child is alive (1 = Yes, 0 = No)\n", - "\n", - "b7 Age at death in months (imputed)\n", - "\n", - "v008 Date of interview (CMC)\n", - "\n", - "v005 Woman’s individual sample weight" - ] - }, - { - "cell_type": "markdown", - "id": "a2f57835-fe83-44fe-b9a3-a0e21381c21b", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc050b96-7cd0-4eba-aa9b-0744e531dd4c", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)\n", - "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- 
file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'mortality')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\", \"DHS.rates\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91cd32a3-7ec7-467d-917c-2a2fbd0f13a1", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "markdown", - "id": "1f0a0363-d31a-4667-bad2-f3aca94c57ed", - "metadata": {}, - "source": [ - "## Geo/admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca05f908-4c8a-4fd7-832c-6d52a7da83e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "1e7a06e8-fe84-4a75-aac8-316abc36e7ce", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ec3b998-25a8-455b-8b61-4201d300888e", - "metadata": {}, - "outputs": [], - "source": [ - "indicator_u5mr <- 'U5MR_PERMIL' # to be computed\n", - "\n", - "data_source <- 'DHS'\n", - "household_recode <- 'HR'\n", - "births_recode <- 'BR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", - "\n", - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_br_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, births_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_br_zip_filename), 
exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('U5_MORT', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_br_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", births_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_br_filename)){\n", - " stop(\"The input DHS data do not have the same version/issue. Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_br_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_br_filename)) # births recode\n", - "dhs_br_dt <- setDT(dhs_br_dt)\n", - "\n", - "# Make admin codes and names dataframe (for future merging)\n", - "\n", - "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, 
admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.\")\n", - "}\n", - "\n", - "rm(dhs_hr_dt) # free up resources" - ] - }, - { - "cell_type": "markdown", - "id": "71af8f28-3a2d-4ae2-812c-7cec9cd89288", - "metadata": {}, - "source": [ - "## Preprocess DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e560e51b-369e-4365-868b-1285a5522f7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Relevant columns\n", - "household_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", - "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", - "birth_date_col <- \"B3\" # Date of birth of child (CMC)\n", - "alive_col <- \"B5\" # Child is alive (1 = Yes, 0 = No)\n", - "death_age_col <-\"B7\" # Age at death in months (imputed)\n", - "end_date_col <- \"V008\" # Date of interview (CMC)\n", - "\n", - "dhs_br_dt[, (birth_date_col) := as.integer(get(birth_date_col))]\n", - "dhs_br_dt[, (death_age_col) := as.integer(get(death_age_col))]\n", - "dhs_br_dt[, (end_date_col) := as.integer(get(end_date_col))]\n", - "\n", - "dhs_br_dt <- dhs_br_dt[\n", - " ,\n", - " .SD,\n", - " .SDcols = c(\n", - " household_id_cols,\n", - " household_sampling_cols,\n", - " birth_date_col,\n", - " alive_col,\n", - " death_age_col,\n", - " end_date_col\n", - ")\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "d48831a5-2b1f-4201-9438-e7fca9b8accb", - "metadata": {}, - "source": [ - "## Compute indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf8c11a9-e69c-47e2-87df-a693b9e68cd3", - "metadata": {}, - "outputs": [], - "source": [ - "region_dt_list <- split(dhs_br_dt, by = \"V024\")\n", - "\n", - "u5mort_table <- rbindlist(\n", - " lapply(region_dt_list, make_dhs_adm1_u5mort_dt) \n", - ")\n", - "\n", - "lower_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- 
glue(\"{toupper(indicator_u5mr)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_u5mr)}_SAMPLE_AVERAGE\")\n", - "\n", - "# add necessary missing columns and remove non-necessary present columns\n", - "u5mort_table <- merge.data.table(dhs_admin_dt, u5mort_table, by = 'DHS_ADM1_CODE', all = TRUE)\n", - "setnames(u5mort_table,\n", - " old=c(\"R\", \"LCI\", \"UCI\"),\n", - " new=c(\n", - " sample_avg_col,\n", - " lower_bound_col,\n", - " upper_bound_col\n", - " ),\n", - " skip_absent=TRUE # not changing all names\n", - " )\n", - "u5mort_table <- merge.data.table(admin_data, u5mort_table, by = admin_name_col)\n", - "u5mort_table <- u5mort_table[\n", - " ,\n", - " .SD,\n", - " .SDcols = c(\n", - " admin_cols,\n", - " sample_avg_col,\n", - " lower_bound_col,\n", - " upper_bound_col\n", - " )\n", - " ]\n", - "\n", - "# Cap the CI's at 0 (in case of small numbers)\n", - "u5mort_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - "\n", - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}\")\n", - "write.csv(u5mort_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(u5mort_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25bf0a56-a8bf-45a9-b63f-d0bdbd606fde", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb 
b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb index 60d8c5f..53b0429 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb @@ -63,47 +63,22 @@ "\n", "options(scipen=999)\n", "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", "# Paths\n", "ROOT_PATH <- '~/workspace'\n", "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", "\n", "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", + "setup_ctx <- bootstrap_dhs_indicators_context(root_path = ROOT_PATH)\n", + 
"DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n" + "reticulate::py_config()$python" ] }, { diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb index 4e8d453..9ff3880 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb @@ -61,12 +61,7 @@ "source": [ "# Paths\n", "ROOT_PATH <- '~/workspace'\n", - "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')" + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')" ] }, { @@ -81,14 +76,19 @@ "outputs": [], "source": [ "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_indicator_tables.r\"))\n", "\n", - "# List required pcks\n", - "required_packages <- c(\"readr\", \"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"httr\", \"arrow\")\n", + "setup_ctx <- bootstrap_dhs_indicators_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"readr\", \"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", 
\"httr\", \"arrow\")\n", + ")\n", "\n", - "# Execute function\n", - "install_and_load(required_packages)" + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)" ] }, { @@ -102,9 +102,7 @@ }, "outputs": [], "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" + "reticulate::py_config()$python" ] }, { @@ -118,17 +116,7 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)" + "# Configuration already loaded by bootstrap_dhs_indicators_context()." 
] }, { diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r index 58d0129..53f523a 100644 --- a/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r @@ -1,3 +1,55 @@ +bootstrap_dhs_indicators_context <- function( + root_path = "~/workspace", + required_packages = c( + "haven", "sf", "glue", "survey", "data.table", "stringi", + "jsonlite", "httr", "reticulate", "arrow" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + dhs_data_path <- file.path(data_path, "dhs", "raw") + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + + config_file_name <- "SNT_config.json" + config_json <- tryCatch( + { + jsonlite::fromJSON(file.path(config_path, config_file_name)) + }, + error = function(e) { + msg <- paste0("Error while loading configuration", conditionMessage(e)) + cat(msg) + stop(msg) + } + ) + + log_msg(paste0("SNT configuration loaded from : ", file.path(config_path, config_file_name))) + + list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + DHS_DATA_PATH = dhs_data_path, + config_json = config_json, + COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, + openhexa = openhexa + ) +} + compute_and_export_indicator_table <- function( design_obj, indicator_name, diff --git a/pipelines/snt_healthcare_access/code/snt_healthcare_access.ipynb 
b/pipelines/snt_healthcare_access/code/snt_healthcare_access.ipynb index 3e61276..bbc2737 100644 --- a/pipelines/snt_healthcare_access/code/snt_healthcare_access.ipynb +++ b/pipelines/snt_healthcare_access/code/snt_healthcare_access.ipynb @@ -56,13 +56,7 @@ "# Project paths\n", "ROOT_PATH <- '~/workspace'\n", "PROJECT_PATH <- file.path(ROOT_PATH, \"pipelines/snt_healthcare_access\")\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "UTILS_PATH <- file.path(PROJECT_PATH, 'utils')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'healthcare_access')\n", - "OUTPUT_PLOTS_PATH <- file.path(PROJECT_PATH, 'reporting', 'outputs')" + "UTILS_PATH <- file.path(PROJECT_PATH, 'utils')" ] }, { @@ -127,19 +121,15 @@ "# Global settings\n", "options(scipen=999)\n", "\n", - "# Load snt utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(UTILS_PATH, \"snt_healthcare_access.r\"))\n", + "setup_ctx <- bootstrap_healthcare_access_context(root_path = ROOT_PATH)\n", "\n", - "# Required packages # \"geojsonio\", #\"RColorBrewer\",\n", - "required_packages <- c(\"jsonlite\", \"dplyr\", \"data.table\", \"ggplot2\", \"arrow\", \"glue\", \"sf\", \"terra\", \"httr\", \"reticulate\", \"arrow\", \"stringr\")\n", - "install_and_load(required_packages)\n", - "terraOptions(memfrac = 0.5)\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "OUTPUT_DATA_PATH <- setup_ctx$OUTPUT_DATA_PATH\n", + "OUTPUT_PLOTS_PATH <- setup_ctx$OUTPUT_PLOTS_PATH\n", "\n", - "# Openhexa\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", "\n", "# Load SNT config\n", "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", diff --git a/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r 
b/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r index 7cf1db2..a10aabd 100644 --- a/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r +++ b/pipelines/snt_healthcare_access/utils/snt_healthcare_access.r @@ -1,3 +1,41 @@ +bootstrap_healthcare_access_context <- function( + root_path = "~/workspace", + required_packages = c( + "jsonlite", "dplyr", "data.table", "ggplot2", "arrow", "glue", + "sf", "terra", "httr", "reticulate", "stringr" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + data_path <- file.path(root_path, "data") + output_data_path <- file.path(data_path, "healthcare_access") + output_plots_path <- file.path(root_path, "pipelines", "snt_healthcare_access", "reporting", "outputs") + dir.create(output_data_path, recursive = TRUE, showWarnings = FALSE) + dir.create(output_plots_path, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + terra::terraOptions(memfrac = 0.5) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + + list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + DATA_PATH = data_path, + OUTPUT_DATA_PATH = output_data_path, + OUTPUT_PLOTS_PATH = output_plots_path, + openhexa = openhexa + ) +} + load_spatial_units_data <- function(shapes_file, dhis2_dataset, country_code) { if (!is.null(shapes_file) && !is.na(shapes_file) && trimws(shapes_file) != "") { custom_shapes_path <- path.expand(shapes_file) diff --git a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb index 1b8afef..96527b2 100644 --- a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb +++ 
b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb @@ -113,12 +113,7 @@ "source": [ "# Paths\n", "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_seasonality_cases')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_cases')\n", - "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_seasonality_cases')" ] }, { @@ -132,9 +127,20 @@ }, "outputs": [], "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_seasonality_cases.r\"))" + "# Load utils and bootstrap context\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_seasonality_cases.r\"))\n", + "\n", + "setup_ctx <- bootstrap_seasonality_cases_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\n", + " \"jsonlite\", \"data.table\", \"ggplot2\", \"fpp3\", \"arrow\", \"glue\",\n", + " \"sf\", \"RColorBrewer\", \"httr\", \"reticulate\"\n", + " )\n", + ")\n", + "\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "OUTPUT_DATA_PATH <- setup_ctx$OUTPUT_DATA_PATH\n", + "INTERMEDIATE_RESULTS_PATH <- setup_ctx$INTERMEDIATE_RESULTS_PATH" ] }, { @@ -148,22 +154,7 @@ }, "outputs": [], "source": [ - "# List required pcks\n", - "required_packages <- c(\n", - " \"jsonlite\",\n", - " \"data.table\",\n", - " \"ggplot2\",\n", - " \"fpp3\",\n", - " \"arrow\",\n", - " \"glue\",\n", - " \"sf\",\n", - " \"RColorBrewer\",\n", - " \"httr\",\n", - " \"reticulate\"\n", - ")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" + "# Packages are loaded in bootstrap_seasonality_cases_context()." 
] }, { @@ -182,9 +173,7 @@ }, "outputs": [], "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", "\n", "# Check that compute_month_seasonality() supports the required parameter\n", "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", diff --git a/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r index 1c4e8bf..2ef6035 100644 --- a/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r +++ b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r @@ -1,3 +1,41 @@ +bootstrap_seasonality_cases_context <- function( + root_path = "~/workspace", + required_packages = c( + "jsonlite", "data.table", "ggplot2", "fpp3", "arrow", "glue", + "sf", "RColorBrewer", "httr", "reticulate" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + output_data_path <- file.path(root_path, "data", "seasonality_cases") + intermediate_results_path <- file.path(output_data_path, "intermediate_results") + dir.create(output_data_path, recursive = TRUE, showWarnings = FALSE) + dir.create(intermediate_results_path, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + + list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + OUTPUT_DATA_PATH = output_data_path, + INTERMEDIATE_RESULTS_PATH = intermediate_results_path, + openhexa = openhexa + ) +} + compute_cases_proportion <- 
function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) { if (is.na(block_duration) || is.infinite(block_duration)) { return(NA_real_) From 905e422437d1c8aa7adff94a404e0a163a87be7e Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 2 Apr 2026 13:45:47 +0200 Subject: [PATCH 23/23] fix + back to previous version for pyramyd et extract --- .../code/NER_pyramid_format.ipynb | 19 +++++++-- .../utils/snt_dhis2_extract.r | 30 -------------- .../code/snt_dhis2_formatting_routine.ipynb | 25 +++-------- .../snt_dhis2_formatting_report.ipynb | 10 ++--- .../utils/snt_dhis2_formatting.r | 32 +++++++++++++++ .../utils/snt_dhis2_formatting_report.r | 22 ++++++++++ .../code/snt_dhis2_incidence.ipynb | 12 +++--- .../utils/snt_dhis2_incidence.r | 25 ++++++++++- .../snt_dhis2_population_transformation.ipynb | 14 ++----- .../snt_dhis2_population_transformation.r | 17 ++++++++ .../code/snt_dhs_bednets_computation.ipynb | 8 ++-- .../snt_dhs_careseeking_computation.ipynb | 8 ++-- .../code/snt_dhs_mortality_computation.ipynb | 8 ++-- .../code/snt_dhs_prevalence_computation.ipynb | 8 ++-- .../snt_dhs_vaccination_computation.ipynb | 8 ++-- .../utils/snt_dhs_indicator_tables.r | 16 ++++++++ .../reporting/snt_map_extracts_report.ipynb | 23 +++++------ .../utils/snt_map_extracts_report.r | 15 +++++++ .../code/snt_seasonality_cases.ipynb | 15 ++++--- .../utils/snt_seasonality_cases.r | 31 ++++++++++++++ .../snt_worldpop_extract_report.ipynb | 41 +++++++------------ .../utils/snt_worldpop_extract_report.r | 15 +++++++ 22 files changed, 259 insertions(+), 143 deletions(-) diff --git a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb index d29c52f..1b0ce4b 100644 --- a/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb +++ b/pipelines/snt_dhis2_extract/code/NER_pyramid_format.ipynb @@ -35,11 +35,22 @@ "source": [ "# PROJECT PATHS\n", "if (!exists(\"ROOT_PATH\")) ROOT_PATH <- 
\"~/workspace\"\n", - "PIPELINE_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\")\n", + "SNT_ROOT_PATH <- ROOT_PATH\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_extract\")\n", "\n", + "# Load snt utils and pipeline helpers\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_extract.r\"))\n", - "setup_ctx <- bootstrap_dhis2_extract_context(root_path = ROOT_PATH)\n", - "SNT_ROOT_PATH <- setup_ctx$ROOT_PATH" + "\n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"dplyr\", \"tidyverse\", \"jsonlite\", \"reticulate\", \"glue\", \"sf\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Load openhexa SDK and toolbox\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "openhexa_toolbox <- import(\"openhexa.toolbox\")" ] }, { @@ -221,7 +232,7 @@ "polygons_level3 <- polygons_level3[!is.na(polygons_level3$geometry) & polygons_level3$geometry != \"\", ]\n", "points_level_6 <- group_prioritaires_level_3[!is.na(group_prioritaires_level_3$geometry) & group_prioritaires_level_3$geometry != \"\", ]\n", "\n", - "# Disable S2 (assume planar coordinates)\n", + "# Use S2 geodesic operations\n", "sf_use_s2(TRUE)\n", "\n", "# Convert to sf and validate poligons\n", diff --git a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r index 9010dcc..54304b7 100644 --- a/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r +++ b/pipelines/snt_dhis2_extract/utils/snt_dhis2_extract.r @@ -1,35 +1,5 @@ # Shared helpers for snt_dhis2_extract notebooks. 
-bootstrap_dhis2_extract_context <- function( - root_path = "~/workspace", - required_packages = c("arrow", "dplyr", "tidyverse", "jsonlite", "reticulate", "glue", "sf"), - load_openhexa = TRUE -) { - code_path <- file.path(root_path, "code") - pipeline_path <- file.path(root_path, "pipelines", "snt_dhis2_extract") - - source(file.path(code_path, "snt_utils.r")) - install_and_load(required_packages) - - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - openhexa <- NULL - openhexa_toolbox <- NULL - if (load_openhexa) { - openhexa <- reticulate::import("openhexa.sdk") - openhexa_toolbox <- reticulate::import("openhexa.toolbox") - } - assign("openhexa", openhexa, envir = .GlobalEnv) - assign("openhexa_toolbox", openhexa_toolbox, envir = .GlobalEnv) - - list( - ROOT_PATH = root_path, - CODE_PATH = code_path, - PIPELINE_PATH = pipeline_path, - openhexa = openhexa, - openhexa_toolbox = openhexa_toolbox - ) -} - printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") } diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb index c9c27bb..98a500e 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb @@ -164,16 +164,8 @@ }, "outputs": [], "source": [ - "# CHECK SNT configuration \n", - "snt_config_mandatory <- c(\"COUNTRY_CODE\", \"DHIS2_ADMINISTRATION_1\", \"DHIS2_ADMINISTRATION_2\") #, \"ORG_UNITS_LEVELS_SELECTION\")\n", - "for (conf in snt_config_mandatory) {\n", - " print(paste(conf, \":\", config_json$SNT_CONFIG[conf]))\n", - " if (is.null(config_json$SNT_CONFIG[[conf]])) {\n", - " msg <- paste(\"Missing configuration input:\", conf)\n", - " cat(msg) \n", - " stop(msg)\n", - " }\n", - "}\n", + "# CHECK SNT configuration\n", + "validate_required_snt_config(config_json)\n", "\n", 
"# Save this country code in a variable\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", @@ -212,15 +204,10 @@ "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", "\n", "# Load file from dataset\n", - "dhis2_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_analytics.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 analytics file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 analytics data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_data), collapse=\", \"))\n", - "log_msg(msg)" + "dhis2_data <- load_dhis2_analytics_extract(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index b49d1a7..37a117c 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -219,12 +219,10 @@ "outputs": [], "source": [ "# import analytics DHIS2 data\n", - "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", - " \" the report cannot be executed. 
[ERROR DETAILS] \", conditionMessage(e))\n", - " stop(msg)\n", - " })\n", + "routine_data <- load_routine_data_report(\n", + " dataset_name = dataset_name,\n", + " country_code = COUNTRY_CODE\n", + ")\n", "\n", "printdim(routine_data)" ] diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r index 26b29e8..fd16b68 100644 --- a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r @@ -52,3 +52,35 @@ build_routine_indicators <- function(routine_data_ind, dhis_indicator_definition empty_data_indicators = empty_data_indicators ) } + +validate_required_snt_config <- function(config_json, required_fields = c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2")) { + for (conf in required_fields) { + if (is.null(config_json$SNT_CONFIG[[conf]])) { + msg <- paste("Missing configuration input:", conf) + log_msg(msg, level = "error") + stop(msg) + } + } + invisible(TRUE) +} + +load_dhis2_analytics_extract <- function(dataset_name, country_code) { + dhis2_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_dhis2_raw_analytics.parquet")) + }, + error = function(e) { + msg <- paste("Error while loading DHIS2 analytics file for:", country_code, conditionMessage(e)) + log_msg(msg, level = "error") + stop(msg) + } + ) + msg <- paste0( + "DHIS2 analytics data loaded from dataset : ", + dataset_name, + " dataframe dimensions: ", + paste(dim(dhis2_data), collapse = ", ") + ) + log_msg(msg) + dhis2_data +} diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r index ecfabfe..66541c7 100644 --- a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r @@ -4,6 +4,28 @@ printdim <- function(df, name = 
deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") } +load_routine_data_report <- function(dataset_name, country_code) { + routine_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_routine.parquet")) + }, + error = function(e) { + msg <- paste0( + "[WARNING] Error while loading DHIS2 Routine data for: ", + country_code, + " the report cannot be executed. [ERROR DETAILS] ", + conditionMessage(e) + ) + stop(msg) + } + ) + + log_msg(glue::glue( + "DHIS2 routine file loaded from dataset: {dataset_name}. Dimensions: {nrow(routine_data)} rows, {ncol(routine_data)} columns." + )) + routine_data +} + detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = "mad_flag") { data_long %>% dplyr::group_by(OU, indicator, YEAR) %>% diff --git a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb index a2a4d97..78ba01a 100644 --- a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb +++ b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb @@ -225,7 +225,10 @@ }, "outputs": [], "source": [ - "load_dhis2_routine_data()" + "load_dhis2_routine_data(\n", + " required_fixed_cols = fixed_cols,\n", + " required_indicators = DHIS2_INDICATORS\n", + ")" ] }, { @@ -260,12 +263,7 @@ }, "outputs": [], "source": [ - "# `fixed_cols`: Fixed columns that should be always present regardless of the config.\n", - "check_fixed_cols_in_routine()\n", - "\n", - "# `DHIS2_INDICATORS`: Indicators, as defined in the config.json file, \n", - "# are expected to be present if the extraction pipeline and this pipeline are run on the same config settings.\n", - "check_dhis2_indicators_cols_in_routine()" + "# Fixed columns and DHIS2 indicators are validated inside load_dhis2_routine_data()." 
] }, { diff --git a/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r b/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r index ec3d180..0c06682 100644 --- a/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r +++ b/pipelines/snt_dhis2_incidence/utils/snt_dhis2_incidence.r @@ -94,7 +94,7 @@ routine_filename <<- paste0(COUNTRY_CODE, routine_name) log_msg(glue("Selected routine dataset: {routine_dataset_name}, filename: {routine_filename}")) } -load_dhis2_routine_data <- function() { +load_dhis2_routine_data <- function(required_fixed_cols = NULL, required_indicators = NULL, cast_year_month = TRUE) { dhis2_routine <<- tryCatch({ get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, error = function(e) { if (grepl("does not exist", conditionMessage(e), ignore.case = TRUE)) { @@ -104,6 +104,29 @@ msg <- paste0("[ERROR] 🛑 Error while loading DHIS2 routine data file : ", rou } stop(msg) }) + +if (cast_year_month && all(c("YEAR", "MONTH") %in% names(dhis2_routine))) { +dhis2_routine[c("YEAR", "MONTH")] <- lapply(dhis2_routine[c("YEAR", "MONTH")], as.numeric) +} + +if (!is.null(required_fixed_cols)) { +missing_fixed <- setdiff(required_fixed_cols, colnames(dhis2_routine)) +if (length(missing_fixed) > 0) { +msg <- paste0("🚨 Missing fixed columns in routine data: ", paste(missing_fixed, collapse = ", ")) +log_msg(msg, "error") +stop(msg) +} +} + +if (!is.null(required_indicators)) { +missing_indicators <- setdiff(required_indicators, colnames(dhis2_routine)) +if (length(missing_indicators) > 0) { +msg <- paste0("🚨 Missing DHIS2 indicators in routine data: ", paste(missing_indicators, collapse = ", ")) +log_msg(msg, "error") +stop(msg) +} +} + log_msg(paste0("DHIS2 routine data : ", routine_filename, " loaded. 
Dims: ", paste(dim(dhis2_routine), collapse=", "))) return(head(dhis2_routine, 3)) } diff --git a/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb b/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb index 0b4457f..bd4e6b0 100644 --- a/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb +++ b/pipelines/snt_dhis2_population_transformation/code/snt_dhis2_population_transformation.ipynb @@ -140,16 +140,10 @@ "outputs": [], "source": [ "# Load file from dataset\n", - "dhis2_population <- tryCatch({ \n", - " get_latest_dataset_file_in_memory(format_dataset_id, paste0(COUNTRY_CODE, \"_population.parquet\")) \n", - " }, error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " log_msg(msg, \"error\")\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- glue(\"DHIS2 population data loaded from dataset : {format_dataset_id} dataframe dimensions: [{paste(dim(dhis2_population), collapse=', ')}]\")\n", - "log_msg(msg)" + "dhis2_population <- load_population_input_data(\n", + " dataset_name = format_dataset_id,\n", + " country_code = COUNTRY_CODE\n", + ")" ] }, { diff --git a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r index 0a4a006..da77193 100644 --- a/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r +++ b/pipelines/snt_dhis2_population_transformation/utils/snt_dhis2_population_transformation.r @@ -27,6 +27,23 @@ bootstrap_population_transformation_context <- function( ) } +load_population_input_data <- function(dataset_name, country_code) { + tryCatch( + { + dhis2_population <- get_latest_dataset_file_in_memory(dataset_name, paste0(country_code, "_population.parquet")) 
+ log_msg(glue::glue( + "DHIS2 population data loaded from dataset : {dataset_name} dataframe dimensions: [{paste(dim(dhis2_population), collapse=', ')}]" + )) + dhis2_population + }, + error = function(e) { + msg <- paste("[ERROR] Error while loading DHIS2 population file for:", country_code, conditionMessage(e)) + log_msg(msg, "error") + stop(msg) + } + ) +} + get_total_population_reference <- function(config_json, adjust_with_untotals = FALSE) { if (!adjust_with_untotals) { return(NULL) diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb index 82cb250..cf9939c 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb @@ -227,10 +227,10 @@ "\n", "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")" ] }, { diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb index bcc6013..aed2f69 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb @@ -154,10 +154,10 @@ "\n", "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = 
\"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", "\n", "spatial_data <- st_as_sf(spatial_data)\n", "\n", diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb index fe9a51d..8c8fee9 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb @@ -134,10 +134,10 @@ "\n", "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", "\n", "spatial_data <- st_as_sf(spatial_data)\n", "\n", diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb index 53b0429..1797ed0 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb @@ -121,10 +121,10 @@ "\n", "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", - 
"spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", "\n", "spatial_data <- st_as_sf(spatial_data)\n", "\n", diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb index 9ff3880..228ca75 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb @@ -174,10 +174,10 @@ "\n", "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", "\n", "spatial_data <- st_as_sf(spatial_data)\n", "\n", diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r index 53f523a..267f25a 100644 --- a/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r @@ -50,6 +50,22 @@ bootstrap_dhs_indicators_context <- function( ) } 
+load_dhs_spatial_data <- function(dhis2_dataset, country_code) { + spatial_data_filename <- paste(country_code, "shapes.geojson", sep = "_") + spatial_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 shapes data for {country_code}: {conditionMessage(e)}") + log_msg(msg, "error") + stop(msg) + } + ) + log_msg(glue::glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}")) + spatial_data +} + compute_and_export_indicator_table <- function( design_obj, indicator_name, diff --git a/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb b/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb index b53b685..2115b5a 100644 --- a/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb +++ b/pipelines/snt_map_extracts/reporting/snt_map_extracts_report.ipynb @@ -96,20 +96,19 @@ "outputs": [], "source": [ "# import seasonality data\n", - "map_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_map_data.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", + "map_data <- load_map_report_input(\n", + " dataset_name = DATASET_NAME,\n", + " filename = paste0(COUNTRY_CODE, \"_map_data.parquet\"),\n", + " label = \"MAP extracted data\"\n", + ")\n", + "\n", "# import DHIS2 shapes data\n", "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", + "shapes_data <- 
load_map_report_input(\n", + " dataset_name = DATASET_DHIS2,\n", + " filename = paste0(COUNTRY_CODE, \"_shapes.geojson\"),\n", + " label = \"DHIS2 shapes data\"\n", + ")\n", "\n", "printdim(map_data)" ] diff --git a/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r b/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r index eed83a0..05819b7 100644 --- a/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r +++ b/pipelines/snt_map_extracts/utils/snt_map_extracts_report.r @@ -2,6 +2,21 @@ printdim <- function(df, name = deparse(substitute(df))) { cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") } +load_map_report_input <- function(dataset_name, filename, label = "dataset file") { + data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, filename) + }, + error = function(e) { + msg <- paste("Error while loading", label, "for file:", filename, conditionMessage(e)) + cat(msg) + stop(msg) + } + ) + log_msg(paste0(label, " loaded from dataset: ", dataset_name, " dataframe dimensions: ", paste(dim(data), collapse = ", "))) + data +} + build_metric_plots <- function(map_data_joined, metrics) { purrr::map(metrics, function(metric) { diff --git a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb index 96527b2..87663d7 100644 --- a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb +++ b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb @@ -380,10 +380,12 @@ }, "outputs": [], "source": [ - "# Load spatial file from dataset\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + "# Load spatial/routine inputs from dataset\n", + "seasonality_inputs <- 
load_seasonality_input_data(\n", + " dataset_name = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "spatial_data <- seasonality_inputs$spatial_data" ] }, { @@ -402,10 +404,7 @@ }, "outputs": [], "source": [ - "# Load routine data from dataset\n", - "case_data_filename <- paste(COUNTRY_CODE, \"routine.parquet\", sep = \"_\")\n", - "original_dt <- get_latest_dataset_file_in_memory(dhis2_dataset, case_data_filename)\n", - "log_msg(glue(\"File {case_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + "original_dt <- seasonality_inputs$original_dt" ] }, { diff --git a/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r index 2ef6035..ca55b9a 100644 --- a/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r +++ b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r @@ -36,6 +36,37 @@ bootstrap_seasonality_cases_context <- function( ) } +load_seasonality_input_data <- function(dataset_name, country_code) { + spatial_filename <- paste(country_code, "shapes.geojson", sep = "_") + routine_filename <- paste(country_code, "routine.parquet", sep = "_") + + spatial_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, spatial_filename) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 shapes file for {country_code}: {conditionMessage(e)}") + log_msg(msg, level = "error") + stop(msg) + } + ) + log_msg(glue::glue("File {spatial_filename} successfully loaded from dataset version: {dataset_name}")) + + original_dt <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, routine_filename) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine file for {country_code}: {conditionMessage(e)}") + log_msg(msg, level = "error") + stop(msg) + } + ) + log_msg(glue::glue("File {routine_filename} successfully loaded from dataset version: {dataset_name}")) + + 
list(spatial_data = spatial_data, original_dt = original_dt) +} + compute_cases_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) { if (is.na(block_duration) || is.infinite(block_duration)) { return(NA_real_) diff --git a/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb b/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb index 2fba875..b60cddd 100644 --- a/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb +++ b/pipelines/snt_worldpop_extract/reporting/snt_worldpop_extract_report.ipynb @@ -132,15 +132,11 @@ "outputs": [], "source": [ "# Load worldpop population\n", - "worldpop_population <- tryCatch({ get_latest_dataset_file_in_memory(worldpop_dataset, parquet_file) },\n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading WorldPop population file \",parquet_file,\" for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"WorldPop population file \",parquet_file,\" loaded from dataset : \", worldpop_dataset, \" dataframe dimensions: \", paste(dim(worldpop_population), collapse=\", \"))\n", - "log_msg(msg)" + "worldpop_population <- load_worldpop_report_input(\n", + " dataset_name = worldpop_dataset,\n", + " filename = parquet_file,\n", + " label = \"WorldPop population file\"\n", + ")" ] }, { @@ -155,15 +151,11 @@ "outputs": [], "source": [ "# Load DHIS2 population\n", - "dhis2_population <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 population file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 population data loaded from dataset : \", dhis2_formatted_dataset, \" dataframe dimensions: \", 
paste(dim(dhis2_population), collapse=\", \"))\n", - "log_msg(msg)" + "dhis2_population <- load_worldpop_report_input(\n", + " dataset_name = dhis2_formatted_dataset,\n", + " filename = paste0(COUNTRY_CODE, \"_population.parquet\"),\n", + " label = \"DHIS2 population file\"\n", + ")" ] }, { @@ -178,14 +170,11 @@ "outputs": [], "source": [ "# Load DHIS2 shapes data\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_formatted_dataset, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "msg <- paste0(\"DHIS2 shapes data loaded from dataset : \", dhis2_formatted_dataset, \" dataframe dimensions: \", paste(dim(shapes_data), collapse=\", \"))\n", - "log_msg(msg)" + "shapes_data <- load_worldpop_report_input(\n", + " dataset_name = dhis2_formatted_dataset,\n", + " filename = paste0(COUNTRY_CODE, \"_shapes.geojson\"),\n", + " label = \"DHIS2 shapes data\"\n", + ")" ] }, { diff --git a/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r b/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r index c9dda99..64158c2 100644 --- a/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r +++ b/pipelines/snt_worldpop_extract/utils/snt_worldpop_extract_report.r @@ -10,6 +10,21 @@ find_country_parquet_file <- function(dataset_last_version, country_code) { parquet_file } +load_worldpop_report_input <- function(dataset_name, filename, label = "dataset file") { + data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, filename) + }, + error = function(e) { + msg <- paste("Error while loading", label, filename, conditionMessage(e)) + cat(msg) + stop(msg) + } + ) + log_msg(paste0(label, " loaded from dataset: ", dataset_name, " dataframe dimensions: ", paste(dim(data), collapse = ", "))) + data +} + get_comparison_years <- 
function(worldpop_population, dhis2_population) { worldpop_year <- min(worldpop_population$YEAR)