From dca051572f148d380f37fbcbc19a0b5e5e794648 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 19 Mar 2026 17:00:43 +0100 Subject: [PATCH 1/3] refactor(pipeline): update routine data choice parameters and streamline file resolution logic - Revised routine data choice options for clarity and consistency across pipelines. - Removed deprecated outlier processing method parameter and integrated its functionality into the routine data choice. - Enhanced file resolution logic to accommodate new routine data choices, ensuring correct file handling based on user selection. --- snt_dhis2_incidence/pipeline.py | 14 +- snt_dhis2_reporting_rate_dataset/pipeline.py | 128 ++++++++----------- 2 files changed, 63 insertions(+), 79 deletions(-) diff --git a/snt_dhis2_incidence/pipeline.py b/snt_dhis2_incidence/pipeline.py index 9e2fe1e..d4d6f38 100644 --- a/snt_dhis2_incidence/pipeline.py +++ b/snt_dhis2_incidence/pipeline.py @@ -24,10 +24,11 @@ @parameter( "routine_data_choice", name="Routine data to use", - help="Which routine data to use for the analysis. Options: 'raw' data is simply formatted and aligned;" - "'raw_without_outliers' is the raw data after outliers removed;" - " 'imputed' contains imputed values after outliers removal", - choices=["raw", "raw_without_outliers", "imputed"], + help="Which routine data to use for the analysis. 
" + "'raw' loads formatted routine data, " + "'imputed' loads outliers-imputed routine data, " + "'outliers_removed' loads routine data with outliers removed.", + choices=["raw", "imputed", "outliers_removed"], type=str, default="imputed", required=True, @@ -120,9 +121,12 @@ def snt_dhis2_incidence( "Pregnant women": "PREGNANT_WOMAN", } + notebook_routine_data_choice = ( + "raw_without_outliers" if routine_data_choice == "outliers_removed" else routine_data_choice + ) notebook_params = { "N1_METHOD": n1_method, - "ROUTINE_DATA_CHOICE": routine_data_choice, + "ROUTINE_DATA_CHOICE": notebook_routine_data_choice, "USE_CSB_DATA": use_csb_data, "USE_ADJUSTED_POPULATION": use_adjusted_population, "DISAGGREGATION_SELECTION": ( diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index 819c89f..f6052fc 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -15,36 +15,18 @@ @pipeline("snt_dhis2_reporting_rate_dataset") @parameter( - "outliers_method", - name="Outlier processing method", - help="Specify which method was used to detect outliers in routine data. " - "Chose 'Routine data (Raw)' to use raw routine data.", + "routine_data_choice", + name="Routine data source", + help="Select which routine data to use. 
" + "'raw' loads formatted routine data, " + "'imputed' loads outliers-imputed routine data, " + "'outliers_removed' loads routine data with outliers removed.", multiple=False, - choices=[ - "Routine data (Raw)", - "Mean (Classic)", - "Median (Classic)", - "IQR (Classic)", - "Trend (PATH)", - "MG Partial (MagicGlasses2)", - "MG Complete (MagicGlasses2)", - ], + choices=["raw", "imputed", "outliers_removed"], type=str, - default="Routine data (Raw)", + default="imputed", required=True, ) -@parameter( - "use_removed_outliers", - name="Use routine data with outliers removed (else: uses imputed)", - help="Enable this option to use routine data after outliers have been removed, " - "based on the outlier detection method you selected above. " - " If you leave this off, the pipeline will instead use either:" - " A) the imputed routine data (where outlier values have been replaced), or" - " B) the raw routine data, if you chose 'Routine data (Raw)' as your outlier processing method.", - type=bool, - default=False, - required=False, -) @parameter( "run_report_only", name="Run reporting notebook only", @@ -66,7 +48,7 @@ required=False, ) def snt_dhis2_reporting_rate_dataset( - outliers_method: list, use_removed_outliers: bool, run_report_only: bool, pull_scripts: bool + routine_data_choice: str, run_report_only: bool, pull_scripts: bool ): """Orchestration function. 
Calls other functions within the pipeline.""" if pull_scripts: @@ -90,19 +72,27 @@ def snt_dhis2_reporting_rate_dataset( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] if not run_report_only: - routine_file = resolve_routine_filename(outliers_method, use_removed_outliers) - routine_file = f"{country_code}{routine_file}" - if outliers_method == "Routine data (Raw)": + routine_file_candidates = resolve_routine_file_candidates( + country_code=country_code, routine_data_choice=routine_data_choice + ) + if routine_data_choice == "raw": ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] - # Check the file exists in the dataset - if not dataset_file_exists(ds_id=ds_outliers_id, filename=routine_file): + routine_file = next( + ( + filename + for filename in routine_file_candidates + if dataset_file_exists(ds_id=ds_outliers_id, filename=filename) + ), + None, + ) + if routine_file is None: current_run.log_warning( - f"Routine file {routine_file} not found in the dataset {ds_outliers_id}, " - "perhaps the outliers imputation pipeline has not been run yet. " - "Processing cannot continue." + f"None of the expected routine files were found in dataset {ds_outliers_id}: " + f"{routine_file_candidates}. " + "Perhaps the outliers-imputation pipeline has not been run yet. Processing cannot continue." ) return @@ -112,7 +102,7 @@ def snt_dhis2_reporting_rate_dataset( } params_file = save_pipeline_parameters( - pipeline_name="snt_dhis2_reporting_rate_dataelement", + pipeline_name="snt_dhis2_reporting_rate_dataset", parameters=nb_parameters, output_path=data_path, country_code=country_code, @@ -153,44 +143,34 @@ def snt_dhis2_reporting_rate_dataset( raise -def resolve_routine_filename(outliers_method: str, is_removed: bool) -> str: - """Returns the routine data filename based on the selected outliers method. 
- - Parameters - ---------- - outliers_method : str - The method used for outlier removal. - is_removed : bool - Whether to return the filename for removed outliers or imputed outliers. - - Returns - ------- - str - The filename corresponding to the selected outliers method. - - Raises - ------ - ValueError - If the outliers method is unknown. - """ - if outliers_method == "Routine data (Raw)": - return "_routine.parquet" - - method_suffix_map = { - "Mean (Classic)": "mean", - "Median (Classic)": "median", - "IQR (Classic)": "iqr", - "Trend (PATH)": "trend", - "MG Partial (MagicGlasses2)": "mg-partial", - "MG Complete (MagicGlasses2)": "mg-complete", - } - - try: - suffix = method_suffix_map[outliers_method] - except KeyError as err: - raise ValueError(f"Unknown outliers method: {outliers_method}") from err - - return f"_routine_outliers-{suffix}{'_removed' if is_removed else '_imputed'}.parquet" +def resolve_routine_file_candidates(country_code: str, routine_data_choice: str) -> list[str]: + """Returns ordered candidate filenames for a routine data choice.""" + if routine_data_choice == "raw": + return [f"{country_code}_routine.parquet"] + + if routine_data_choice == "imputed": + return [ + f"{country_code}_routine_outliers_imputed.parquet", + f"{country_code}_routine_outliers-mean_imputed.parquet", + f"{country_code}_routine_outliers-median_imputed.parquet", + f"{country_code}_routine_outliers-iqr_imputed.parquet", + f"{country_code}_routine_outliers-trend_imputed.parquet", + f"{country_code}_routine_outliers-mg-partial_imputed.parquet", + f"{country_code}_routine_outliers-mg-complete_imputed.parquet", + ] + + if routine_data_choice == "outliers_removed": + return [ + f"{country_code}_routine_outliers_removed.parquet", + f"{country_code}_routine_outliers-mean_removed.parquet", + f"{country_code}_routine_outliers-median_removed.parquet", + f"{country_code}_routine_outliers-iqr_removed.parquet", + f"{country_code}_routine_outliers-trend_removed.parquet", + 
f"{country_code}_routine_outliers-mg-partial_removed.parquet", + f"{country_code}_routine_outliers-mg-complete_removed.parquet", + ] + + raise ValueError(f"Unknown routine data choice: {routine_data_choice}") if __name__ == "__main__": From bc87c40923e47289938ba16685ed8a6706c03268 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 20 Mar 2026 11:10:16 +0100 Subject: [PATCH 2/3] test --- .../code/snt_dhis2_incidence.ipynb | 3288 ++++++++--------- .../snt_dhis2_incidence_NER.ipynb | 3288 ++++++++--------- .../snt_dhis2_reporting_rate_dataset.ipynb | 2312 ++++++------ snt_dhis2_incidence/pipeline.py | 5 +- snt_dhis2_reporting_rate_dataset/pipeline.py | 41 +- 5 files changed, 4455 insertions(+), 4479 deletions(-) diff --git a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb index 8ae33d1..9b0ff8e 100644 --- a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb +++ b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb @@ -1,1649 +1,1649 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f5827740-2917-4504-9017-9ec7d408e5f4", - "metadata": {}, - "source": [ - "Script structure:\n", - " 1. Setup:\n", - " * Paths\n", - " * Utils functions\n", - " * Load and check config file\n", - " 2. Load Data\n", - " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", - " * **Population data** (DHIS2) already formatted & aggregated (output of pipeline YYY) & aggregated at **ADM2 x YEAR** level
\n", - " **Note**: in some Countries (i.e., Niger), population and and crude incidence data is also available for **specific sections** of the popultion (i.e., preganant women, children under 5)\n", - " * (optional) **Care seeking (taux recherche soins)** (DHS)\n", - " * **Reporting Rate**, based on what is available (last run reporting rate pipeline), uses _either_ one of:\n", - " * \"**Dataset**\": pre-cumputed (directly downloadable from SNIS DHIS2 instance) and formatted&aligned elsewhere (output of pipelibe `dhis2-reporting-rate`)\n", - " * \"**Data Element**: calculated from routine DHIS2 data, based on reports for defined indicators and \"active\" facilities\n", - " 3. Calculate **Incidence**\n", - " 1. calculate **monthly cases**\n", - " 2. calculate **yearly incidence**: Crude, Adjusted 1 (Test Positivity Rate), Adjusted 2 (Reporting Rate), (optional) Adjusted 3 (Care Seeking Behaviour)" - ] - }, - { - "cell_type": "markdown", - "id": "cdd5409b-dc0e-45f4-ae4e-dffcdb25059b", - "metadata": {}, - "source": [ - "-------------------\n", - "**Naming harmonization to improve code readability:**\n", - "\n", - "**Incidence**, COLUMN NAMES (always capitalized!):\n", - "* \"INCIDENCE_CRUDE\" = \"Crude\"\n", - "* \"INCIDENCE_ADJ_TESTING\" = \"Adjusted 1 (Testing)\"\n", - "* \"INCIDENCE_ADJ_REPORTING\" = \"Adjusted 2 (Reporting)\"\n", - "* _\"INCIDENCE_ADJ_CARESEEKING\" = \"Adjusted 3 (Careseeking)\"_ ⚠️is this good naming?" - ] - }, - { - "cell_type": "markdown", - "id": "96d5dffc-ff34-4a14-b2b7-1e71e6afad07", - "metadata": {}, - "source": [ - "**Reporting Rate** data frames, based on two **methods**:\n", - "* follwo this structure: reporting\\_rate\\_\\. 
So:\n", - " * **Dataset**: `reporting_rate_dataset` (for report nb only: `reporting_rate_dataset_year`)\n", - " * **Data Element** (Diallo 2025): `reporting_rate_dataelement` (for report nb only: `reporting_rate_dataelement_year`)" - ] - }, - { - "cell_type": "markdown", - "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", - "metadata": {}, - "source": [ - "--------------------" - ] - }, - { - "cell_type": "markdown", - "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", - "metadata": {}, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "markdown", - "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", - "metadata": {}, - "source": [ - "### 1.0. Fallback parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72fad25e-85fd-4ae9-8fe3-c142077f8d67", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ----- ⚡ Defined in pipeline.py code ---------------\n", - "if (!exists(\"N1_METHOD\")) N1_METHOD <- \"SUSP-TEST\" # ⚡ For N1 calculations: use `SUSP-TEST` or `PRES`\n", - "if (!exists(\"ROUTINE_DATA_CHOICE\")) ROUTINE_DATA_CHOICE <- \"raw\" # \"raw\" \"raw_without_outliers\" \"imputed\"\n", - "if (!exists(\"USE_CSB_DATA\")) USE_CSB_DATA <- FALSE # ⚡ USE_CSB_DATA bool\n", - "if (!exists(\"USE_ADJUSTED_POPULATION\")) USE_ADJUSTED_POPULATION <- FALSE # ⚡ USE_ADJUSTED_POPULATION bool " - ] - }, - { - "cell_type": "markdown", - "id": "d7d2f065-f8ad-4580-aa24-64a6d9bd7acb", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation \n", - "Only for countries in which disaggregated data is available. Pipeline fails if you select something you don't have." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63362c4a-6a55-4310-aa7a-81bea39aa734", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!exists(\"DISAGGREGATION_SELECTION\")) DISAGGREGATION_SELECTION <- \"UNDER_5\" # NULL # Options: \"PREGNANT_WOMAN\", \"UNDER_5\", ... 
\n", - "# Disaggregation options set in pipeline.py parameters, based on \n", - "# https://bluesquare.atlassian.net/browse/SNT25-363?focusedCommentId=85587" - ] - }, - { - "cell_type": "markdown", - "id": "ecff4a51-c6fa-4e84-a465-5bb87d9b1333", - "metadata": {}, - "source": [ - "### 1.1. Run setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# PROJECT PATHS\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", - "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\")) # palettes \n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right path\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22dbb20b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!dir.exists(INTERMEDIATE_DATA_PATH)) {\n", - " dir.create(INTERMEDIATE_DATA_PATH, recursive = TRUE)\n", - " 
log_msg(glue(\"Created directory for intermediate results: {INTERMEDIATE_DATA_PATH}\"))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", - "metadata": {}, - "source": [ - "### 1.2. Load and check `config` file\n", - "\n", - "**Checks for SNT mandatory configuration fields**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")) \n", - "log_msg(msg)\n", - "\n", - "# Generic\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"ANY\"\n", - "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "# Fixed routine formatting columns\n", - "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') \n", - "print(paste(\"Fixed routine data ('dhis2_routine') columns (always expected): \", paste(fixed_cols, collapse=\", \")))" - ] - }, - { - "cell_type": "markdown", - "id": "95006478", - "metadata": {}, - "source": [ - "### 1.3. 
Helper function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa504ca5-928c-4778-ad31-5c4de7bbbf60", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# helper function \n", - "resolve_routine_filename <- function(routine_choice) { \n", - " if (routine_choice == \"raw\") return(\"_routine.parquet\")\n", - " is_removed <- FALSE\n", - " if (routine_choice == \"raw_without_outliers\") is_removed <- TRUE \n", - " removed_status <- if (is_removed) \"removed\" else \"imputed\" \n", - " return(glue::glue(\"_routine_outliers_{removed_status}.parquet\"))\n", - "} " - ] - }, - { - "cell_type": "markdown", - "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", - "metadata": {}, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", - "metadata": {}, - "source": [ - "### 2.1. **Routine** data (DHIS2) (parametrized choice)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ddb31b18", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select routine dataset and filename\n", - "if (ROUTINE_DATA_CHOICE == \"raw\") { \n", - " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", - " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", - "} else { \n", - " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", - " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30691d35-f859-4f92-8eb2-5791a425f153", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load file from dataset \n", - "dhis2_routine <- tryCatch({ 
get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, \n", - " error = function(e) { \n", - " # Check if the error message indicates that the file does not exist \n", - " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", - " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", - " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", - " } else {\n", - " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". [ERROR DETAILS] \" , conditionMessage(e))\n", - " } \n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 routine data : `\", routine_filename, \"` loaded from dataset : `\", routine_dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", - "log_msg(msg)\n", - "\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "b78c12ec-407f-4088-9a7f-08838b2d208b", - "metadata": {}, - "source": [ - "#### Checks on routine data columns" - ] - }, - { - "cell_type": "markdown", - "id": "b1dcb02d", - "metadata": {}, - "source": [ - " `fixed_cols`: Fixed columns that should be always present regardless of the config." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3514f20-3726-436e-b34b-7a171d1718d4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if all \"fixed\" cols are present in dhis2_routine\n", - "actual_cols <- colnames(dhis2_routine) # dhis2_routine\n", - "missing_cols <- setdiff(fixed_cols, actual_cols) # Columns in fixed_cols but not in actual_cols)\n", - "\n", - "# Check if all required columns are present\n", - "all_present <- length(missing_cols) == 0\n", - "if (all_present) { \n", - " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'fixed' columns: \", paste(fixed_cols, collapse = \", \"), \".\"))\n", - "} else {\n", - " log_msg(paste0(\"🚨 Missing Columns: The following required columns are NOT present in 'dhis2_routine': \", paste(missing_cols, collapse = \", \"), \".\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "cd203dec-61b2-4510-9c84-30054e7b99e2", - "metadata": {}, - "source": [ - "`DHIS2_INDICATORS`: Indicators, as defined in the config.json file, are expected to be present if the extraction pipeline and this pipeline are run on the same config settings." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb04b888-8c5e-452a-8eb4-96025b0fa65a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if all \"DHIS2_INDICATORS\" cols are present in dhis2_routine\n", - "missing_cols <- setdiff(DHIS2_INDICATORS, actual_cols) # all elements in DHIS2_INDICATORS but not in actual_cols\n", - "all_present <- length(missing_cols) == 0\n", - "if (all_present) { \n", - " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'DHIS2_INDICATORS' columns: \", paste(DHIS2_INDICATORS, collapse = \", \"), \".\"))\n", - "} else {\n", - " log_msg(paste0(\n", - " \"🚨 Missing Columns: The following columns for DHIS2 INDICATORS are NOT present in 'dhis2_routine': \",\n", - " paste(missing_cols, collapse = \", \"),\n", - " \".\\n🚨 Looks like the config.json file was modified after extraction.\\n🚨 The analysis will continue WITHOUT the missing indicators.\"\n", - " ), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3ba1a6e8-aa08-4624-a6a5-4852bf4127e4", - "metadata": {}, - "source": [ - "#### Checks on `N1_METHOD` selected\n", - "_**if**_ `N1_METHOD == PRES` then `PRES` must exist in config.json file _and_ in routine data
\n", - "_**else**_ N1 will use `SUSP-TEST` instead" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96a7025e-083b-464d-8498-f7fdff493293", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check that col `PRES` exists in both config file and routine data\n", - "if (N1_METHOD == \"PRES\") {\n", - " pres_in_routine <- any(names(dhis2_routine) == \"PRES\")\n", - " pres_in_config <- any(DHIS2_INDICATORS == \"PRES\")\n", - "\n", - " if (!pres_in_routine) {\n", - " log_msg(\"🛑 Column `PRES` missing from routine data! 🚨 N1 calculations will use `SUSP-TEST` instead!\", \"error\")\n", - " stop()\n", - " }\n", - " if (!pres_in_config) {\n", - " log_msg(\"⚙️ Note: `PRES` set as parameter in this pipeline, but not defined as indicator in the configuration file (SNT_config.json)\", \"error\")\n", - " stop()\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "1c5e84cf", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "458e3d78-3552-4447-93f8-6812a5d655be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "INDICATORS_FOUND <- FALSE # 👈 \n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && !is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", - "if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", - "\n", - " # Determine the dynamic prefix based on the method\n", - " prefix_method <- ifelse(N1_METHOD == \"SUSP-TEST\", \"SUSP\", \"PRES\")\n", - " prefix_all <- c(prefix_method, \"TEST\", \"CONF\") \n", - " # Define the expected column names \n", - " # (also make available for the 'else' warning message if the check fails)\n", - " target_colnames <- glue(\"{prefix_all}_{DISAGGREGATION_SELECTION}\")\n", - " \n", - " if (all(target_colnames %in% colnames(dhis2_routine))) {\n", - " \n", - " # We map 
the specific columns (e.g., SUSP_UNDER5) to generic names (e.g., SUSP)\n", - " dhis2_routine[prefix_all] <- dhis2_routine[target_colnames]\n", - " \n", - " for (col in target_colnames) {\n", - " log_msg(glue(\"Population Disaggregation: Successfully mapped indicator: {col}\"))\n", - " }\n", - " \n", - " # Signal success for the next code block\n", - " INDICATORS_FOUND <- TRUE\n", - " \n", - " } else {\n", - " missing_cols <- setdiff(target_colnames, colnames(dhis2_routine))\n", - " log_msg(glue(\"Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed.\"), \"warning\")\n", - " log_msg(glue(\"Population Disaggregation: Missing columns in routine dataset: {paste(missing_cols, collapse = ', ')}\"), \"warning\")\n", - " \n", - " msg <- glue(\"[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.\") \n", - " stop(msg)\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "4473e75e-94d2-4f24-b6eb-38a7685542ad", - "metadata": {}, - "source": [ - "### 2.2. Load population data at level ADM2 x YEAR\n", - "\n", - "Already formatted & aggregated. 
\n", - "\n", - "**Expecting** table with these **cols** (bold = **must have**): \n", - "* ADM1_ID\n", - "* **ADM2_ID**\n", - "* **YEAR**\n", - "* **POPULATION** (pop at ADM2 level)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ce922c7-6dab-44cf-a94f-8a03d1f816a1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Select population file \n", - "if (USE_ADJUSTED_POPULATION) {\n", - " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", - "} else {\n", - " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_population_adm2 <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_pop_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, \n", - " \" [ERROR DETAILS] \", conditionMessage(e)) # log error message , \n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "log_msg(glue(\"DHIS2 population data loaded from dataset: {dhis2_pop_dataset}. 
Dataframe dimensions: {paste(dim(dhis2_population_adm2), collapse=', ')}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7163965", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_population_adm2 |> head()" - ] - }, - { - "cell_type": "markdown", - "id": "6ae0c5fa", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8620491", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (INDICATORS_FOUND) { \n", - " POPULATION_SELECTION <- paste0(\"POP_\", DISAGGREGATION_SELECTION) \n", - " if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) {\n", - " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset.\"), \"warning\")\n", - " POPULATION_SELECTION <- \"POPULATION\"\n", - " }\n", - " # The selected column is assigned to POPULATION col so that later code can use it generically\n", - " dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]]\n", - " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.\"))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e596d0ed-56df-4756-83ed-717cfa72f643", - "metadata": {}, - "source": [ - "#### 2.2.1 **Population** data (DHIS2) columns selection.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5107756-f007-4c39-a4f6-b2ab0a653bd5", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_population_adm2 <- dhis2_population_adm2 %>% select(YEAR, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, POPULATION) \n", - "\n", - "dim(dhis2_population_adm2)\n", - "head(dhis2_population_adm2, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "b42a65ab-ad8d-41ba-9edb-dc2636f03a06", - "metadata": {}, - "source": [ - "### 
2.3. (optional) **Care Seeking Behaviour** (CSB DHS) (taux recherche soins)\n", - "(20250728) Note: **changed units** (proportion to %), see https://bluesquare.atlassian.net/browse/SNT25-127 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a6a5338-9ffd-47d2-b92f-79deb7886078", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHS_INDICATORS\n", - "file_name <- glue::glue(\"{COUNTRY_CODE}_DHS_ADM1_PCT_CARESEEKING_SAMPLE_AVERAGE.parquet\")\n", - "\n", - "if (USE_CSB_DATA == TRUE) {\n", - " # Read the data, if error (cannot find at defined path) -> set careseeking_data to NULL (so it doesn't break the function at # 3.)\n", - " careseeking_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"🛑 Error while loading DHS Care Seeking data file from `\", dataset_name, file_name ,\"`.\", conditionMessage(e)) # log error message\n", - " log_msg(msg, \"error\")\n", - " return(NULL) # make object NULL on error\n", - " })\n", - " \n", - " # Only print success messages and data info if careseeking_data is NOT NULL\n", - " if (!is.null(careseeking_data)) {\n", - " log_msg(paste0(\"Care Seeking data : \", file_name, \" loaded from dataset : \", dataset_name))\n", - " log_msg(paste0(\"Care Seeking data frame dimensions: \", nrow(careseeking_data), \" rows, \", ncol(careseeking_data), \" columns.\"))\n", - " head(careseeking_data)\n", - " } else {\n", - " log_msg(paste0(\"🚨 Care-seeking data not loaded due to an error, `careseeking_data` is set to `NULL`!\"), \"warning\")\n", - " }\n", - " \n", - "} else {\n", - " # if `USE_CSB_DATA == FALSE` ... (basically, ignore CSB data)\n", - " careseeking_data <- NULL\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "92723594-000b-41ee-82a1-8e69106a277d", - "metadata": {}, - "source": [ - "### 2.4. 
Load Reporting Rate \n", - "\n", - "Import Reporting Rate file based on what is available in the latest OH Dataset version (which depends on last run reporting rate pipepline).\n", - "\n", - "📅 **Important**: reporting rate must be **monthly**!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5722d64-ce61-4244-960e-57ebac28e4cf", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "f5827740-2917-4504-9017-9ec7d408e5f4", + "metadata": {}, + "source": [ + "Script structure:\n", + " 1. Setup:\n", + " * Paths\n", + " * Utils functions\n", + " * Load and check config file\n", + " 2. Load Data\n", + " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", + " * **Population data** (DHIS2) already formatted & aggregated (output of pipeline YYY) & aggregated at **ADM2 x YEAR** level
\n", + " **Note**: in some countries (e.g., Niger), population and crude incidence data is also available for **specific sections** of the population (e.g., pregnant women, children under 5)\n", + " * (optional) **Care seeking (taux recherche soins)** (DHS)\n", + " * **Reporting Rate**, based on what is available (last run reporting rate pipeline), uses _either_ one of:\n", + " * \"**Dataset**\": pre-computed (directly downloadable from SNIS DHIS2 instance) and formatted & aligned elsewhere (output of pipeline `dhis2-reporting-rate`)\n", + " * \"**Data Element**\": calculated from routine DHIS2 data, based on reports for defined indicators and \"active\" facilities\n", + " 3. Calculate **Incidence**\n", + " 1. calculate **monthly cases**\n", + " 2. calculate **yearly incidence**: Crude, Adjusted 1 (Test Positivity Rate), Adjusted 2 (Reporting Rate), (optional) Adjusted 3 (Care Seeking Behaviour)" + ] + }, + { + "cell_type": "markdown", + "id": "cdd5409b-dc0e-45f4-ae4e-dffcdb25059b", + "metadata": {}, + "source": [ + "-------------------\n", + "**Naming harmonization to improve code readability:**\n", + "\n", + "**Incidence**, COLUMN NAMES (always capitalized!):\n", + "* \"INCIDENCE_CRUDE\" = \"Crude\"\n", + "* \"INCIDENCE_ADJ_TESTING\" = \"Adjusted 1 (Testing)\"\n", + "* \"INCIDENCE_ADJ_REPORTING\" = \"Adjusted 2 (Reporting)\"\n", + "* _\"INCIDENCE_ADJ_CARESEEKING\" = \"Adjusted 3 (Careseeking)\"_ ⚠️is this good naming?" + ] + }, + { + "cell_type": "markdown", + "id": "96d5dffc-ff34-4a14-b2b7-1e71e6afad07", + "metadata": {}, + "source": [ + "**Reporting Rate** data frames, based on two **methods**:\n", + "* follow this structure: reporting\\_rate\\_\\<method\\>. 
So:\n", + " * **Dataset**: `reporting_rate_dataset` (for report nb only: `reporting_rate_dataset_year`)\n", + " * **Data Element** (Diallo 2025): `reporting_rate_dataelement` (for report nb only: `reporting_rate_dataelement_year`)" + ] + }, + { + "cell_type": "markdown", + "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", + "metadata": {}, + "source": [ + "--------------------" + ] + }, + { + "cell_type": "markdown", + "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", + "metadata": {}, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "markdown", + "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", + "metadata": {}, + "source": [ + "### 1.0. Fallback parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72fad25e-85fd-4ae9-8fe3-c142077f8d67", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ----- ⚡ Defined in pipeline.py code ---------------\n", + "if (!exists(\"N1_METHOD\")) N1_METHOD <- \"SUSP-TEST\" # ⚡ For N1 calculations: use `SUSP-TEST` or `PRES`\n", + "if (!exists(\"ROUTINE_DATA_CHOICE\")) ROUTINE_DATA_CHOICE <- \"raw\" # \"raw\" \"outliers_removed\" \"imputed\"\n", + "if (!exists(\"USE_CSB_DATA\")) USE_CSB_DATA <- FALSE # ⚡ USE_CSB_DATA bool\n", + "if (!exists(\"USE_ADJUSTED_POPULATION\")) USE_ADJUSTED_POPULATION <- FALSE # ⚡ USE_ADJUSTED_POPULATION bool " + ] + }, + { + "cell_type": "markdown", + "id": "d7d2f065-f8ad-4580-aa24-64a6d9bd7acb", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation \n", + "Only for countries in which disaggregated data is available. Pipeline fails if you select something you don't have." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63362c4a-6a55-4310-aa7a-81bea39aa734", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!exists(\"DISAGGREGATION_SELECTION\")) DISAGGREGATION_SELECTION <- \"UNDER_5\" # NULL # Options: \"PREGNANT_WOMAN\", \"UNDER_5\", ... 
\n", + "# Disaggregation options set in pipeline.py parameters, based on \n", + "# https://bluesquare.atlassian.net/browse/SNT25-363?focusedCommentId=85587" + ] + }, + { + "cell_type": "markdown", + "id": "ecff4a51-c6fa-4e84-a465-5bb87d9b1333", + "metadata": {}, + "source": [ + "### 1.1. Run setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# PROJECT PATHS\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", + "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\")) # palettes \n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right path\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22dbb20b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!dir.exists(INTERMEDIATE_DATA_PATH)) {\n", + " dir.create(INTERMEDIATE_DATA_PATH, recursive = TRUE)\n", + " 
log_msg(glue(\"Created directory for intermediate results: {INTERMEDIATE_DATA_PATH}\"))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", + "metadata": {}, + "source": [ + "### 1.2. Load and check `config` file\n", + "\n", + "**Checks for SNT mandatory configuration fields**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")) \n", + "log_msg(msg)\n", + "\n", + "# Generic\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"ANY\"\n", + "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "\n", + "# Fixed routine formatting columns\n", + "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') \n", + "print(paste(\"Fixed routine data ('dhis2_routine') columns (always expected): \", paste(fixed_cols, collapse=\", \")))" + ] + }, + { + "cell_type": "markdown", + "id": "95006478", + "metadata": {}, + "source": [ + "### 1.3. 
Helper function(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa504ca5-928c-4778-ad31-5c4de7bbbf60", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Map the routine data choice to its file suffix; `raw_without_outliers` (sent by pipeline.py) is an alias of `outliers_removed`\n", + "resolve_routine_filename <- function(routine_choice) { \n", + " if (routine_choice == \"raw\") return(\"_routine.parquet\")\n", + " if (routine_choice %in% c(\"outliers_removed\", \"raw_without_outliers\")) return(\"_routine_outliers_removed.parquet\")\n", + " if (routine_choice == \"imputed\") return(\"_routine_outliers_imputed.parquet\")\n", + " # Unknown choice: fail loudly instead of silently loading the imputed file\n", + " stop(glue::glue(\"[ERROR] 🛑 Unknown ROUTINE_DATA_CHOICE '{routine_choice}'. Use 'raw', 'imputed' or 'outliers_removed'.\"))\n", + "} " + ] + }, + { + "cell_type": "markdown", + "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", + "metadata": {}, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", + "metadata": {}, + "source": [ + "### 2.1. **Routine** data (DHIS2) (parametrized choice)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb31b18", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select routine dataset and filename\n", + "if (ROUTINE_DATA_CHOICE == \"raw\") { \n", + " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", + " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", + "} else { \n", + " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", + " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30691d35-f859-4f92-8eb2-5791a425f153", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load file from dataset \n", + "dhis2_routine <- tryCatch({ 
get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, \n", + " error = function(e) { \n", + " # Check if the error message indicates that the file does not exist \n", + " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", + " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", + " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", + " } else {\n", + " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". [ERROR DETAILS] \" , conditionMessage(e))\n", + " } \n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 routine data : `\", routine_filename, \"` loaded from dataset : `\", routine_dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", + "log_msg(msg)\n", + "\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "b78c12ec-407f-4088-9a7f-08838b2d208b", + "metadata": {}, + "source": [ + "#### Checks on routine data columns" + ] + }, + { + "cell_type": "markdown", + "id": "b1dcb02d", + "metadata": {}, + "source": [ + " `fixed_cols`: Fixed columns that should be always present regardless of the config." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3514f20-3726-436e-b34b-7a171d1718d4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if all \"fixed\" cols are present in dhis2_routine\n", + "actual_cols <- colnames(dhis2_routine) # dhis2_routine\n", + "missing_cols <- setdiff(fixed_cols, actual_cols) # Columns in fixed_cols but not in actual_cols)\n", + "\n", + "# Check if all required columns are present\n", + "all_present <- length(missing_cols) == 0\n", + "if (all_present) { \n", + " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'fixed' columns: \", paste(fixed_cols, collapse = \", \"), \".\"))\n", + "} else {\n", + " log_msg(paste0(\"🚨 Missing Columns: The following required columns are NOT present in 'dhis2_routine': \", paste(missing_cols, collapse = \", \"), \".\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "cd203dec-61b2-4510-9c84-30054e7b99e2", + "metadata": {}, + "source": [ + "`DHIS2_INDICATORS`: Indicators, as defined in the config.json file, are expected to be present if the extraction pipeline and this pipeline are run on the same config settings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb04b888-8c5e-452a-8eb4-96025b0fa65a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if all \"DHIS2_INDICATORS\" cols are present in dhis2_routine\n", + "missing_cols <- setdiff(DHIS2_INDICATORS, actual_cols) # all elements in DHIS2_INDICATORS but not in actual_cols\n", + "all_present <- length(missing_cols) == 0\n", + "if (all_present) { \n", + " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'DHIS2_INDICATORS' columns: \", paste(DHIS2_INDICATORS, collapse = \", \"), \".\"))\n", + "} else {\n", + " log_msg(paste0(\n", + " \"🚨 Missing Columns: The following columns for DHIS2 INDICATORS are NOT present in 'dhis2_routine': \",\n", + " paste(missing_cols, collapse = \", \"),\n", + " \".\\n🚨 Looks like the config.json file was modified after extraction.\\n🚨 The analysis will continue WITHOUT the missing indicators.\"\n", + " ), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3ba1a6e8-aa08-4624-a6a5-4852bf4127e4", + "metadata": {}, + "source": [ + "#### Checks on `N1_METHOD` selected\n", + "_**if**_ `N1_METHOD == PRES` then `PRES` must exist in config.json file _and_ in routine data
\n", + "_**else**_ N1 will use `SUSP-TEST` instead" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96a7025e-083b-464d-8498-f7fdff493293", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check that col `PRES` exists in both config file and routine data\n", + "if (N1_METHOD == \"PRES\") {\n", + " pres_in_routine <- any(names(dhis2_routine) == \"PRES\")\n", + " pres_in_config <- any(DHIS2_INDICATORS == \"PRES\")\n", + "\n", + " if (!pres_in_routine) {\n", + " log_msg(\"🛑 Column `PRES` missing from routine data! 🚨 N1 calculations will use `SUSP-TEST` instead!\", \"error\")\n", + " stop()\n", + " }\n", + " if (!pres_in_config) {\n", + " log_msg(\"⚙️ Note: `PRES` set as parameter in this pipeline, but not defined as indicator in the configuration file (SNT_config.json)\", \"error\")\n", + " stop()\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "1c5e84cf", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "458e3d78-3552-4447-93f8-6812a5d655be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "INDICATORS_FOUND <- FALSE # 👈 \n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && !is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", + "if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", + "\n", + " # Determine the dynamic prefix based on the method\n", + " prefix_method <- ifelse(N1_METHOD == \"SUSP-TEST\", \"SUSP\", \"PRES\")\n", + " prefix_all <- c(prefix_method, \"TEST\", \"CONF\") \n", + " # Define the expected column names \n", + " # (also make available for the 'else' warning message if the check fails)\n", + " target_colnames <- glue(\"{prefix_all}_{DISAGGREGATION_SELECTION}\")\n", + " \n", + " if (all(target_colnames %in% colnames(dhis2_routine))) {\n", + " \n", + " # We map 
the specific columns (e.g., SUSP_UNDER5) to generic names (e.g., SUSP)\n", + " dhis2_routine[prefix_all] <- dhis2_routine[target_colnames]\n", + " \n", + " for (col in target_colnames) {\n", + " log_msg(glue(\"Population Disaggregation: Successfully mapped indicator: {col}\"))\n", + " }\n", + " \n", + " # Signal success for the next code block\n", + " INDICATORS_FOUND <- TRUE\n", + " \n", + " } else {\n", + " missing_cols <- setdiff(target_colnames, colnames(dhis2_routine))\n", + " log_msg(glue(\"Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed.\"), \"warning\")\n", + " log_msg(glue(\"Population Disaggregation: Missing columns in routine dataset: {paste(missing_cols, collapse = ', ')}\"), \"warning\")\n", + " \n", + " msg <- glue(\"[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.\") \n", + " stop(msg)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4473e75e-94d2-4f24-b6eb-38a7685542ad", + "metadata": {}, + "source": [ + "### 2.2. Load population data at level ADM2 x YEAR\n", + "\n", + "Already formatted & aggregated. 
\n", + "\n", + "**Expecting** table with these **cols** (bold = **must have**): \n", + "* ADM1_ID\n", + "* **ADM2_ID**\n", + "* **YEAR**\n", + "* **POPULATION** (pop at ADM2 level)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ce922c7-6dab-44cf-a94f-8a03d1f816a1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Select population file \n", + "if (USE_ADJUSTED_POPULATION) {\n", + " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", + "} else {\n", + " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "}\n", + " \n", + "# Load file from dataset\n", + "dhis2_population_adm2 <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_pop_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, \n", + " \" [ERROR DETAILS] \", conditionMessage(e)) # log error message , \n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "log_msg(glue(\"DHIS2 population data loaded from dataset: {dhis2_pop_dataset}. 
Dataframe dimensions: {paste(dim(dhis2_population_adm2), collapse=', ')}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7163965", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_population_adm2 |> head()" + ] + }, + { + "cell_type": "markdown", + "id": "6ae0c5fa", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8620491", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (INDICATORS_FOUND) { \n", + " POPULATION_SELECTION <- paste0(\"POP_\", DISAGGREGATION_SELECTION) \n", + " if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) {\n", + " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset.\"), \"warning\")\n", + " POPULATION_SELECTION <- \"POPULATION\"\n", + " }\n", + " # The selected column is assigned to POPULATION col so that later code can use it generically\n", + " dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]]\n", + " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.\"))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e596d0ed-56df-4756-83ed-717cfa72f643", + "metadata": {}, + "source": [ + "#### 2.2.1 **Population** data (DHIS2) columns selection.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5107756-f007-4c39-a4f6-b2ab0a653bd5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_population_adm2 <- dhis2_population_adm2 %>% select(YEAR, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, POPULATION) \n", + "\n", + "dim(dhis2_population_adm2)\n", + "head(dhis2_population_adm2, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "b42a65ab-ad8d-41ba-9edb-dc2636f03a06", + "metadata": {}, + "source": [ + "### 
2.3. (optional) **Care Seeking Behaviour** (CSB DHS) (taux recherche soins)\n", + "(20250728) Note: **changed units** (proportion to %), see https://bluesquare.atlassian.net/browse/SNT25-127 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a6a5338-9ffd-47d2-b92f-79deb7886078", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHS_INDICATORS\n", + "file_name <- glue::glue(\"{COUNTRY_CODE}_DHS_ADM1_PCT_CARESEEKING_SAMPLE_AVERAGE.parquet\")\n", + "\n", + "if (USE_CSB_DATA == TRUE) {\n", + " # Read the data, if error (cannot find at defined path) -> set careseeking_data to NULL (so it doesn't break the function at # 3.)\n", + " careseeking_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"🛑 Error while loading DHS Care Seeking data file from `\", dataset_name, file_name ,\"`.\", conditionMessage(e)) # log error message\n", + " log_msg(msg, \"error\")\n", + " return(NULL) # make object NULL on error\n", + " })\n", + " \n", + " # Only print success messages and data info if careseeking_data is NOT NULL\n", + " if (!is.null(careseeking_data)) {\n", + " log_msg(paste0(\"Care Seeking data : \", file_name, \" loaded from dataset : \", dataset_name))\n", + " log_msg(paste0(\"Care Seeking data frame dimensions: \", nrow(careseeking_data), \" rows, \", ncol(careseeking_data), \" columns.\"))\n", + " head(careseeking_data)\n", + " } else {\n", + " log_msg(paste0(\"🚨 Care-seeking data not loaded due to an error, `careseeking_data` is set to `NULL`!\"), \"warning\")\n", + " }\n", + " \n", + "} else {\n", + " # if `USE_CSB_DATA == FALSE` ... (basically, ignore CSB data)\n", + " careseeking_data <- NULL\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "92723594-000b-41ee-82a1-8e69106a277d", + "metadata": {}, + "source": [ + "### 2.4. 
Load Reporting Rate \n", + "\n", + "Import Reporting Rate file based on what is available in the latest OH Dataset version (which depends on last run reporting rate pipeline).\n", + "\n", + "📅 **Important**: reporting rate must be **monthly**!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5722d64-ce61-4244-960e-57ebac28e4cf", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load monthly reporting rate: try the 'dataelement' file first, then fall back to the 'dataset' file\n", + "# Define dataset and file names (both candidate files live in the same dataset)\n", + "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", + "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", + "\n", + "# Try loading dataelement reporting rates (a failure here is not fatal: NULL triggers the fallback below).\n", + "reporting_rate_month <- tryCatch({\n", + " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_de)\n", + " log_msg(glue(\"Reporting Rate data: `{file_name_de}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df_loaded), collapse=', ')}\"))\n", + " REPORTING_RATE_METHOD <- \"dataelement\"\n", + " df_loaded\n", + "}, \n", + " error = function(e) { \n", + " cat(glue(\"[ERROR] Error while loading Reporting Rate 'dataelement' version for: {COUNTRY_CODE} {conditionMessage(e)}\"))\n", + " return(NULL)\n", + "})\n", + "\n", + "# Try loading dataset reporting rates (last resort: an error here stops the notebook).\n", + "if (is.null(reporting_rate_month)) {\n", + " reporting_rate_month <- tryCatch({\n", + " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_ds) \n", + " log_msg(glue(\"Reporting Rate data: `{file_name_ds}` loaded from dataset: `{rr_dataset_name}`. 
Dataframe dimensions: {paste(dim(df_loaded), collapse=', ')}\"))\n", + " REPORTING_RATE_METHOD <- \"dataset\"\n", + " df_loaded\n", + " }, \n", + " error = function(e) { \n", + " stop(glue(\"[ERROR] Error while loading Reporting Rate 'dataset' version for: {COUNTRY_CODE} {conditionMessage(e)}\")) # raise error\n", + " })\n", + "}\n", + "\n", + "rm(df_loaded)\n", + "dim(reporting_rate_month)\n", + "head(reporting_rate_month, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "9d2529ad-8436-43c4-85b3-ed1ad9621e1e", + "metadata": {}, + "source": [ + "#### 🔍 Check on data completeness for `REPORTING_RATE` data\n", + "Normally we should have \"complete\" data (no missing or `NA` values). However, when using certain datasets (from pipeline: \"Reporting Rate (Dataset)\") we might have incomplete coverage and hence `NA`s ...
\n", + "These are \"problematic\" because **N2** (Incidence adj 2) will also have `NA` values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eae2a67f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check on data completeness for REPORTING RATE data: \n", + "# check how many values of REPORTING_RATE are NA\n", + "na_count <- sum(is.na(reporting_rate_month$REPORTING_RATE)) \n", + "if (na_count > 0) {\n", + " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", + "} else {\n", + " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9cfa7211-5595-4ed6-9699-0f35aebcbc09", + "metadata": {}, + "source": [ + "### 2.5. Load Care seeking data (file) \n", + "\n", + "Load if available" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94ede0e7-e0a8-4e06-ad6c-485869b6d4a9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if the file exist, and try loading it..\n", + "if (file.exists(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\"))) {\n", + " \n", + " care_seeking_data_f <- tryCatch({ read.csv(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading Care Seeking data (NER)\", conditionMessage(e)) \n", + " log_msg(msg)\n", + " stop(msg) \n", + " })\n", + "\n", + " log_msg(\"Care seeking data file loaded: 'uploads/care_seeking_ADM1.csv' (NER Specific).\")\n", + " \n", + " # ensure numeric\n", + " care_seeking_data_f$PCT <- as.numeric(care_seeking_data_f$PCT) \n", + " \n", + "} else {\n", + " care_seeking_data_f <- NULL\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46abe114-0f56-48df-bae7-44147602027c", + "metadata": 
{ + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "care_seeking_data_f" + ] + }, + { + "cell_type": "markdown", + "id": "06f0ebcc-6b87-4d77-98ef-7b8d84be6a0a", + "metadata": {}, + "source": [ + "-------------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "9943c1e5-4d95-4210-8b77-09c4085a96b8", + "metadata": {}, + "source": [ + "## 3. Calculate Incidence\n", + "First calculate monthly cases, then yearly incidence." + ] + }, + { + "cell_type": "markdown", + "id": "8769a974-de8a-4a1f-8f74-edb318a28060", + "metadata": {}, + "source": [ + "### 3.1 **Monthly cases**\n", + "\n", + "\n", + "These methods follow the standard WHO approach for estimating malaria incidence from routine health information systems (WHO, 2023).\n", + "As shown in the code, we begin by calculating **monthly malaria case metrics** (confirmed, tested, presumed) at the **ADM2** level and join them with the **monthly reporting rate**. \n", + "\n", + "This allows us to compute the **test positivity rate** (TPR, where `TPR` = `CONF` / `TEST`) and adjust for incomplete testing using the formula: \n", + "> **N1** = `CONF` + (`PRES` × `CONF` / `TEST`)\n", + "\n", + "Which is equivalent to:\n", + "> **N1** = `CONF` + (`PRES` × **TPR**)\n", + "\n", + "where:\n", + "- **N1** = cases adjusted for testing gaps \n", + "- `CONF` = **confirmed** cases\n", + "- `PRES` = **presumed** cases (either `SUSP` - `TEST` or directly available as `PRES`) 👈 this is a parameter (`N1_METHOD`)\n", + "- `TEST` = **tested** cases \n", + "- **TPR** = Test Positivity Rate (`CONF` / `TEST`)\n", + " \n", + "This produces `N1`, the number of cases adjusted for testing gaps, calculated at the monthly level in line with WHO recommendations to capture intra-annual variation.\n", + "\n", + "Next, we adjust for incomplete reporting using: \n", + "> **N2** = **N1** / `REPORTING_RATE`\n", + "\n", + "where `REPORTING_RATE` is at the monthly levele, and is the ratio of received reports 
(submission to DHIS2) divided by the expected reports.\n", + "\n", + "Finally, _if_ **careseeking** data is **available**, N3 is calculated as follows:\n", + "> **N3** = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE)\n", + "\n", + "where:\n", + "- PCT_PRIVATE_CARE = share of children treated in the **private** sector\n", + "- PCT_PUBLIC_CARE = share of children treated in the **public** sector\n", + "- PCT_NO_CARE = share of children who **did not receive any treatment**\n", + "\n", + "Note that this assumes the same TPR across all sectors (private and public).\n", + "\n", + "\n", + "\n", + "**Important note**
\n", + "In case reporting rate equals zero (none of the health facilities reported in a given month), N2 is set to `NA`. Note that the annual N2 will be underestimated, which is preferable compared to having `Inf` values.\n", + "\n", + "-------------" + ] + }, + { + "cell_type": "markdown", + "id": "dcee32af-ae6d-4b2a-9c7a-f846209f1dc3", + "metadata": {}, + "source": [ + "This calculation expects (input):\n", + "* **routine_data**: DHIS2 routine data, formatted, to be aggregated at ADM2 and MONTH level. Tibble (df) _must_ contain the following cols: `YEAR`, `MONTH`, `ADM1_ID`, `ADM2_ID`, `CONF`, `TEST`, `SUSP` (`PRES` is only required when `N1_METHOD` is `PRES`). \n", + "* **reporting_rate_data**: reporting rate calculated at ADM2 and MONTH level and expressed as proportion **(0-1)**. Tibble (df) _must_ contain the following cols: `ADM2_ID`, `YEAR`, `MONTH`, `REPORTING_RATE`\n", + "\n", + "The calculation produces (output):\n", + "* data frame with the following cols: `ADM1_ID`, `ADM2_ID`, `YEAR`, `MONTH`, `CONF`, `TEST`, `SUSP`, `PRES` (if available), `REPORTING_RATE`, `TPR`, `N1`, `N2`\n", + "\n", + "-----------------" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1a0899a-3308-4d90-b06e-8a0cd4b849e1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Ensure correct data type for numerical columns ---------------------------------------\n", + "routine_data <- dhis2_routine %>%\n", + " mutate(across(any_of(c(\"YEAR\", \"MONTH\", \"CONF\", \"TEST\", \"SUSP\", \"PRES\")), as.numeric))\n", + "\n", + "reporting_rate_data <- reporting_rate_month %>% # reporting_rate_data\n", + " mutate(across(c(YEAR, MONTH, REPORTING_RATE), as.numeric))" + ] + }, + { + "cell_type": "markdown", + "id": "736dec8f", + "metadata": {}, + "source": [ + "#### 3.1.0. 
Aggregate at `ADM2` x `MONTH` & calculate **TPR**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8899964a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check for TEST > SUSP\n", + "routine_data |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58bf219e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Group & compute TPR\n", + "monthly_cases <- routine_data %>%\n", + " group_by(ADM1_ID, ADM2_ID, YEAR, MONTH) %>% # ADM1 needed to join careseeking data\n", + " summarise(\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " across(any_of(\"PRES\"), ~sum(., na.rm = TRUE), .names = \"PRES\"), # <- handles missing 'PRES' column gracefully\n", + " .groups = \"drop\") %>%\n", + " # Cleaning TEST data for \"SUSP-TEST\" method\n", + " mutate(TEST = ifelse(N1_METHOD == \"SUSP-TEST\" & !is.na(SUSP) & (TEST > SUSP), SUSP, TEST)) %>%\n", + " left_join(reporting_rate_data,\n", + " by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>% \n", + " # Calculate TPR based on CONF and TEST\n", + " # Note: if TEST is 0 or NA, set TPR = 1 (to avoid division by zero which produces Inf)\n", + " mutate( \n", + " TPR = ifelse(!is.na(CONF) & !is.na(TEST) & (TEST != 0), CONF / TEST, 1)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938f0194", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check for TEST > SUSP\n", + "monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " + ] + }, + { + "cell_type": "markdown", + "id": "df43d6d8", + "metadata": {}, + "source": [ + "#### 3.1.1. 
Calculate **N1**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead591bb-3936-486d-bb9e-7b01d0805d0d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate N1 (cases adjusted for testing gaps) based on `N1_METHOD` & availability of `PRES` \n", + "\n", + "if (N1_METHOD == \"SUSP-TEST\") {\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", + " log_msg(\"Calculating N1 as `N1 = CONF + ((SUSP - TEST) * TPR)`\")\n", + "} else if (N1_METHOD == \"PRES\") {\n", + " # Use PRES only if the column exists in `monthly_cases` and contains at least one non-missing value\n", + " if (\"PRES\" %in% names(monthly_cases) && !all(is.na(monthly_cases$PRES))) {\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(N1 = CONF + (PRES * TPR))\n", + " log_msg(\"ℹ️ Calculating N1 as `N1 = CONF + (PRES * TPR)`\")\n", + " } else {\n", + " log_msg(\"🚨 Warning: 'PRES' not found in routine data or contains all `NA` values! 🚨 Calculating N1 using 'SUSP-TEST' method instead.\", \"warning\")\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", + " }\n", + "} else {\n", + " stop(glue(\"[ERROR] 🛑 Invalid N1_METHOD '{N1_METHOD}'. Please use 'PRES' or 'SUSP-TEST'.\")) # without N1 the N2 step below cannot run\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9543d283", + "metadata": {}, + "source": [ + "#### 3.1.2. 
Calculate **N2**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5060017a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate N2\n", + "monthly_cases <- monthly_cases %>%\n", + " mutate(\n", + " N2 = ifelse(REPORTING_RATE == 0, NA_real_, N1 / REPORTING_RATE) # On the fly convert `RR == 0` to NA to avoid N2 == Inf\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "debb1745-5066-4126-8a15-853b21ee8776", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Log msg about zero REPORTING RATE cases and warn that N2 set to NA\n", + "\n", + "zero_reporting <- reporting_rate_data %>%\n", + " filter(REPORTING_RATE == 0) %>%\n", + " summarise(\n", + " n_months_zero_reporting = n(),\n", + " affected_zones = n_distinct(ADM2_ID)\n", + " )\n", + "\n", + "if (zero_reporting$n_months_zero_reporting > 0) { \n", + " log_msg(glue(\"🚨 Note: {zero_reporting$n_months_zero_reporting} rows had `REPORTING_RATE == 0` across \",\n", + " \"{zero_reporting$affected_zones} ADM2. These N2 values were set to NA.\"))\n", + "} else {\n", + " log_msg(\"✅ Note: no ADM2 has `REPORTING_RATE == 0`. All N2 values were preserved.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "053f854f-ee8c-40a3-abd6-82d69cc7dca4", + "metadata": {}, + "source": [ + "#### 3.1.3. 
(optional) Compute **N3** with adjusted **N2** by 'care seeking data file' (csv)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a306e23-93b7-4b8d-84ec-e4afe413a613", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!is.null(care_seeking_data_f)) {\n", + " monthly_cases <- monthly_cases %>%\n", + " left_join(., care_seeking_data_f %>% select(ADM1_ID, PCT), by = c(\"ADM1_ID\")) %>%\n", + " mutate(\n", + " N3 = N2 / PCT\n", + " ) %>% \n", + " select(-PCT)\n", + " log_msg(\"N2 adjusted by care seeking data (NER Specific).\")\n", + " # head(monthly_cases)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "dd9b5677", + "metadata": {}, + "source": [ + "#### 3.1.4. (optional) Calculate **N3**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7aa926ed-99ea-474c-988e-8151d6b12002", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Only calculate N3 if CARESEEKING data is available \n", + "if (!is.null(careseeking_data)) {\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(YEAR = as.numeric(YEAR)) %>% # keep as safety\n", + " left_join(., careseeking_data, by = c(\"ADM1_ID\")) %>%\n", + " mutate(\n", + " N3 = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE) \n", + " )\n", + "} else {\n", + " print(\"🦘 Careseeking data not available, skipping calculation of N3.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67ddc0e-40ea-41d7-9fef-5e05d9594956", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(monthly_cases, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "fb4214ba", + "metadata": {}, + "source": [ + "#### 💾 Export `monthly_cases` (for 📓report notebook)\n", + "For coherence checks, which need monthly resolution ... !" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d80a7cb6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Save monthly_cases as .parquet file \n", + "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", + "arrow::write_parquet(monthly_cases, file_path)\n", + "\n", + "# Log msg\n", + "log_msg(glue(\"Monthly cases data saved to: {file_path}\"))\n", + "head(monthly_cases)" + ] + }, + { + "cell_type": "markdown", + "id": "7b50302e-20af-4fa6-8e8c-1e3a6c763ea2", + "metadata": {}, + "source": [ + "### 🔍 Data **coherence** checks on **monthly cases**\n", + "Check for ratios or differences that will cause negative values -> which will cause adjusted incidence to be lower than the values it adjusts\n", + "\n", + "\n", + "Namely, the following relationships among INDICATORs:\n", + "* SUSP-TEST\n", + "* CONF/TEST\n", + "* N1 == CONF ... (when PRES == 0)" + ] + }, + { + "cell_type": "markdown", + "id": "e9f7ae73-46f6-4c78-9bb2-fdcfbd591b10", + "metadata": {}, + "source": [ + "#### 1. `PRES == 0`: causes `N1 == CONF` \n", + "(if `N1_METHOD == \"PRES\"`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495fe18a-50ad-4eff-8669-135c63a7c8dd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Run this check only if N1_METHOD == \"PRES\" (else, problem doesn't exist)\n", + "if (N1_METHOD == \"PRES\") {\n", + " nr_of_pres_0_adm2_month <- monthly_cases |> filter(PRES == 0) |> nrow()\n", + " log_msg(glue(\"🚨 Note: using `PRES` for incidence adjustement, but `PRES == 0` for {nr_of_pres_0_adm2_month} rows (ADM2 x MONTH).\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e12e744b-540e-462c-a16e-edbb05ddc047", + "metadata": {}, + "source": [ + "#### 2. 
`SUSP-TEST`: if negative, then N1 smaller or equal to CONF (ADJ =< CRUDE)\n", + "(if `N1_METHOD == \"SUSP-TEST\"`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49de98f1-2424-440e-922f-72d7702dd894", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# SUSP - TEST: if negative (TEST > SUSP), then N1 smaller or equal to CONF, which then causes ADJ ≤ CRUDE\n", + "if (N1_METHOD == \"SUSP-TEST\") {\n", + " nr_of_negative <- monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() \n", + " if (nr_of_negative > 0) {\n", + " log_msg(\n", + " glue(\"🚨 Note: using formula `SUSP - TEST` for incidence adjustement, but higher tested than suspected cases (`SUSP < TEST`) detected in {nr_of_negative} rows (ADM2 x MONTH).\"),\n", + " \"warning\"\n", + " )\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d72d7545-9afa-45c6-9efd-2619aecfc794", + "metadata": {}, + "source": [ + "#### 3. 
`CONF/TEST` = `TPR` (to calculate N1: Incidence adjusted for **Testing**)\n", + "This **ratio should** always be **≤ 1** because **there should _not_ be more confirmed cases than tested** ...\n", + "\n", + "(but if very small, then N1 could be smaller or equal to CONF (so ADJ INC ≤ CRUDE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc60295-5046-4932-b332-965fd320f72e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "more_confirmed_than_tested <- monthly_cases |> mutate(CONF_divby_TEST = CONF / TEST) |> filter(CONF_divby_TEST > 1) |> nrow() \n", + "\n", + "if (more_confirmed_than_tested > 0) {\n", + " log_msg(glue(\"🚨 Note: higher confirmed than tested cases (`CONF/TEST`) detected in {more_confirmed_than_tested} rows (ADM2 x MONTH).\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "acbabb99-07ce-4054-a702-2d3cd59c328e", + "metadata": {}, + "source": [ + "### 3.2 **Yearly incidence**\n", + "After calculating N1 and N2 for each `ADM2`-`MONTH`, we aggregate the data annually to compute the yearly totals (sums) for crude cases (`CONF`), `N1` and `N2`. Finally, we compute:\n", + "* Crude incidence: C / POP × 1000\n", + "* Incidence adjusted for testing: N1 / POP × 1000\n", + "* Incidence adjusted for testing and reporting: N2 / POP × 1000\n", + "* Incidence adjusted for testing, reporting and careseeking behaviour (optional): N3 / POP × 1000\n", + "\n", + "--------------" + ] + }, + { + "cell_type": "markdown", + "id": "d47a3908-71cb-4e79-8771-f6caceae4ce2", + "metadata": {}, + "source": [ + "The calculation expects (input):\n", + "* **monthly_cases**: as the output of `calculate_monthly_cases()`, or a tibble/data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (CONF, TEST, SUSP, PRES), `TPR`, `N1`, `N2` \n", + "* **population_data**: df of population data formatted and aligned, aggregated at ADM2 and YEAR level. 
A tibble/data frame that _must_ contain the following cols: `ADM2`, `YEAR`, `POPULATION`\n", + "\n", + "The calculation produces (output): \n", + "* a data frame with the following cols: ADM2_ID, YEAR, CONF, N1, N2, `INCIDENCE_CRUDE`, `INCIDENCE_ADJ_TESTING`, `INCIDENCE_ADJ_REPORTING`\n", + "\n", + "--------------------" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8753721-067b-4da3-8305-8d98f823454f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 1. Enforce column types upfront ----\n", + "monthly_cases <- monthly_cases %>% \n", + " mutate(across(where(is.numeric), as.numeric)) # Convert all numeric columns\n", + " \n", + "population_data <- dhis2_population_adm2 %>% # population_data\n", + " mutate(across(c(YEAR, POPULATION), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a9ea81-4f7c-4505-8847-07de13831a42", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 2. Core calculation ----\n", + "yearly_incidence <- monthly_cases %>%\n", + " group_by(ADM2_ID, YEAR) %>%\n", + " summarise(\n", + " # 🚨 removed `na.rm = TRUE` on 20250702 - if things break check here! 
🚨 \n", + " across(c(CONF, N1, N2), ~sum(.)), #, na.rm = TRUE)), # 🔍 PROBLEM: if NA's in N2 (due to missing RR data), the sum of N2 by YEAR is smaller than the sum of N1 !\n", + " # across(any_of(c(\"CONF\", \"TEST\", \"SUSP\", \"PRES\", \"N1\", \"N2\")), ~sum(.)), # silenced as not necessary to also summarize \"TEST\", \"SUSP\", \"PRES\"\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " left_join(\n", + " population_data,\n", + " by = c(\"ADM2_ID\", \"YEAR\")\n", + " ) %>%\n", + " mutate(\n", + " INCIDENCE_CRUDE = CONF / POPULATION * 1000,\n", + " INCIDENCE_ADJ_TESTING = N1 / POPULATION * 1000,\n", + " INCIDENCE_ADJ_REPORTING = N2 / POPULATION * 1000\n", + " ) |>\n", + " ungroup()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c712f4c2-d677-4d22-8298-111aa0a93034", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 3.1 Optional careseeking data CSV adjustment ---- \n", + "if (!is.null(care_seeking_data_f) && \"N3\" %in% names(monthly_cases)) {\n", + " n3_data <- monthly_cases %>%\n", + " group_by(ADM2_ID, YEAR) %>%\n", + " summarise(N3 = sum(N3, na.rm = TRUE),\n", + " .groups = \"drop\") |>\n", + " ungroup()\n", + " \n", + " yearly_incidence <- yearly_incidence %>%\n", + " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", + " )\n", + " } else {\n", + " yearly_incidence <- yearly_incidence |>\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = NA\n", + " )\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7602a82b-8829-4613-8961-61c419073269", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 3.2 Optional careseeking adjustment ----\n", + "if (is.null(care_seeking_data_f)) { # quick fix\n", + " \n", + " if (!is.null(careseeking_data) && \"N3\" %in% names(monthly_cases)) {\n", + " n3_data <- monthly_cases %>%\n", + " 
group_by(ADM2_ID, YEAR) %>%\n", + " summarise(N3 = sum(N3, na.rm = TRUE),\n", + " .groups = \"drop\") |>\n", + " ungroup()\n", + " \n", + " yearly_incidence <- yearly_incidence %>%\n", + " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", + " )\n", + " } else {\n", + " yearly_incidence <- yearly_incidence |>\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = NA\n", + " )\n", + " }\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56001b15-f74e-42d9-bfa2-bd5563b6a512", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(yearly_incidence, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "7976f894-daf4-46c3-9fa1-5303cbba0818", + "metadata": {}, + "source": [ + "### 🔍 Data **coherence** checks on **yearly incidence**\n", + "Here we check if values of incidence (already at `YEAR` resolution) make sense in relation to each other.
\n", + "Namely:\n", + "* crude values should be the lowest, and any consecutive **adjustment** should cause the incidence values to **increase** or remain the **same** - but should never be lower!" + ] + }, + { + "cell_type": "markdown", + "id": "d3dfac34-86f5-4f8c-add9-f54485259924", + "metadata": {}, + "source": [ + "#### 1. `INCIDENCE_ADJ_TESTING` (adj. level 1) should always be greater than `INCIDENCE_CRUDE` (not adjusted)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb03778-f1db-4f28-9b09-3cd8d815f976", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# same as below but different cols ... \n", + "# Count TRUE values, handling potential NAs in the result of if_else\n", + "nr_of_impossible_values <- yearly_incidence |>\n", + " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE)) |>\n", + " pull(IMPOSSIBLE_VALUE) |>\n", + " sum(na.rm = TRUE) \n", + "\n", + "# Warning if any impossible values are found\n", + "if (nr_of_impossible_values > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE!\"), \"warning\")\n", + "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_CRUDE` is smaller than `INCIDENCE_ADJ_TESTING` (as expected).\")\n", + "\n", + "# Check if all values in a column are NA\n", + "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_TESTING))) {\n", + " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_TESTING` are `NA`s\", \"warning\")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "827d1e84-7f43-404c-88cc-9b675bfa48a1", + "metadata": {}, + "source": [ + "#### 2. `INCIDENCE_ADJ_REPORTING` (adj. level 2) should always be greater than `INCIDENCE_ADJ_TESTING` (adj. 
level 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b1da976-b157-4e94-b5e9-8795e87bb416", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Count TRUE values, handling potential NAs in the result of if_else\n", + "nr_of_impossible_values <- yearly_incidence |>\n", + " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)) |>\n", + " pull(IMPOSSIBLE_VALUE) |>\n", + " sum(na.rm = TRUE) \n", + "\n", + "# Warning if any impossible values are found\n", + "if (nr_of_impossible_values > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING!\"), \"warning\")\n", + "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_ADJ_TESTING` is smaller than `INCIDENCE_ADJ_REPORTING` (as expected).\")\n", + "\n", + "# Check if all values in a column are NA\n", + "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_REPORTING))) {\n", + " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_REPORTING` are `NA`s\", \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3e57f2e8-1ccc-417c-9fa6-e6b1976336bc", + "metadata": {}, + "source": [ + "## 4. Export to `/data/dhis2_incidence/` folder" + ] + }, + { + "cell_type": "markdown", + "id": "5b6861bb", + "metadata": {}, + "source": [ + "### 4.0. 
Keep only essential cols \n", + "Based on [SNT Pipelines Data glossary](https://docs.google.com/spreadsheets/d/1qvZMsmCWU6cVLgGZTEXsd5xmoecIxb4LAd-g_2qzYdw/edit?usp=sharing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4085515", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "yearly_incidence <- yearly_incidence |>\n", + "select(\n", + " YEAR, \n", + " starts_with(\"ADM\"),\n", + " starts_with(\"POPULATION\"),\n", + " starts_with(\"INCIDENCE\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e432998-bf85-4706-bea4-8684b0b58c16", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Reusable function to generate filename and save data\n", + "save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function) {\n", + " \n", + " base_name_parts <- c(COUNTRY_CODE, \"_incidence\")\n", + " \n", + " # Concatenate all parts to form the final filename\n", + " file_name <- paste0(c(base_name_parts, file_extension), collapse = \"\")\n", + " # file_path <- file.path(data_path, \"incidence\", file_name)\n", + " file_path <- file.path(data_path, file_name)\n", + " output_dir <- dirname(file_path)\n", + "\n", + " # Check if the output directory exists, else create it\n", + " if (!dir.exists(output_dir)) {\n", + " dir.create(output_dir, recursive = TRUE)\n", + " }\n", + "\n", + " # Flexibility to use function as provided in argument: \"write_csv\" or \"arrow::write_parquet\" ... 
\n", + " write_function(yearly_incidence, file_path)\n", + "\n", + " log_msg(paste0(\"Exporting : \", file_path))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3ed49106-b335-42c3-9511-ffd864dd50f0", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation logic \n", + "\n", + "Provide a msg to the user to indicate that the results correspond to a specific version of indicators and population (under5, pregnant or totals)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f15c0c4-74d9-4280-ba46-8add435a9147", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# if (COUNTRY_CODE == \"NER\" & INDICATORS_FOUND) {\n", + "if (INDICATORS_FOUND) {\n", + " log_msg(glue(\"ℹ️ The results have been computed using the following Indicators: {paste(target_colnames, collapse=', ')}\"))\n", + " log_msg(glue(\"ℹ️ The results have been computed using the following Population: {POPULATION_SELECTION}\"))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16e7d83b-3962-4041-9d2d-aaa362b62d5f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Export the data\n", + "\n", + "# CSV\n", + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv)\n", + "\n", + "# Parquet\n", + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet)" + ] } - }, - "outputs": [], - "source": [ - "# function**\n", - "# Define dataset and file name (based on paramter)\n", - "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - "\n", - "# Try loading dataelement reporting rates.\n", - "reporting_rate_month <- tryCatch({\n", - " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_de)\n", - " 
log_msg(glue(\"Reporting Rate data: `{file_name_de}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df), collapse=', ')}\"))\n", - " REPORTING_RATE_METHOD <- \"dataelement\"\n", - " df_loaded\n", - "}, \n", - " error = function(e) { \n", - " cat(glue(\"[ERROR] Error while loading Reporting Rate 'dataelement' version for: {COUNTRY_CODE} {conditionMessage(e)}\"))\n", - " return(NULL)\n", - "})\n", - "\n", - "# Try loading dataset reporting rates.\n", - "if (is.null(reporting_rate_month)) {\n", - " reporting_rate_month <- tryCatch({\n", - " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_ds) \n", - " log_msg(glue(\"Reporting Rate data: `{file_name_ds}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df), collapse=', ')}\"))\n", - " REPORTING_RATE_METHOD <- \"dataset\"\n", - " df_loaded\n", - " }, \n", - " error = function(e) { \n", - " stop(glue(\"[ERROR] Error while loading Reporting Rate 'dataset' version for: {COUNTRY_CODE} {conditionMessage(e)}\")) # raise error\n", - " })\n", - "}\n", - "\n", - "rm(df_loaded)\n", - "dim(reporting_rate_month)\n", - "head(reporting_rate_month, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "9d2529ad-8436-43c4-85b3-ed1ad9621e1e", - "metadata": {}, - "source": [ - "#### 🔍 Checkon data completeness for `REPORTING_RATE` data\n", - "Normally we should have \"complete\" data (no missing or `NA` values). However, when using certain datasets (from pipeline: \"Reporting Rate (Dataset)\") we might have incomplete coverage and hence `NA`s ...
\n", - "These are \"problematic\" because **N2** (Incidence adj 2) will also have `NA` values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eae2a67f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check on data completeness for REPORTING RATE data: \n", - "# check how many values of REPORTING_RATE are NA\n", - "na_count <- sum(is.na(reporting_rate_month$REPORTING_RATE)) \n", - "if (na_count > 0) {\n", - " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9cfa7211-5595-4ed6-9699-0f35aebcbc09", - "metadata": {}, - "source": [ - "### 2.5. Load Care seeking data (file) \n", - "\n", - "Load if available" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94ede0e7-e0a8-4e06-ad6c-485869b6d4a9", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if the file exist, and try loading it..\n", - "if (file.exists(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\"))) {\n", - " \n", - " care_seeking_data_f <- tryCatch({ read.csv(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading Care Seeking data (NER)\", conditionMessage(e)) \n", - " log_msg(msg)\n", - " stop(msg) \n", - " })\n", - "\n", - " log_msg(\"Care seeking data file loaded: 'uploads/care_seeking_ADM1.csv' (NER Specific).\")\n", - " \n", - " # ensure numeric\n", - " care_seeking_data_f$PCT <- as.numeric(care_seeking_data_f$PCT) \n", - " \n", - "} else {\n", - " care_seeking_data_f <- NULL\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46abe114-0f56-48df-bae7-44147602027c", - "metadata": 
{ - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "care_seeking_data_f" - ] - }, - { - "cell_type": "markdown", - "id": "06f0ebcc-6b87-4d77-98ef-7b8d84be6a0a", - "metadata": {}, - "source": [ - "-------------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "9943c1e5-4d95-4210-8b77-09c4085a96b8", - "metadata": {}, - "source": [ - "## 3. Calculate Incidence\n", - "First calculate monthly cases, then yearly incidence." - ] - }, - { - "cell_type": "markdown", - "id": "8769a974-de8a-4a1f-8f74-edb318a28060", - "metadata": {}, - "source": [ - "### 3.1 **Monthly cases**\n", - "\n", - "\n", - "These methods follow the standard WHO approach for estimating malaria incidence from routine health information systems (WHO, 2023).\n", - "As shown in the code, we begin by calculating **monthly malaria case metrics** (confirmed, tested, presumed) at the **ADM2** level and join them with the **monthly reporting rate**. \n", - "\n", - "This allows us to compute the **test positivity rate** (TPR, where `TPR` = `CONF` / `TEST`) and adjust for incomplete testing using the formula: \n", - "> **N1** = `CONF` + (`PRES` × `CONF` / `TEST`)\n", - "\n", - "Which is equivalent to:\n", - "> **N1** = `CONF` + (`PRES` × **TPR**)\n", - "\n", - "where:\n", - "- **N1** = cases adjusted for testing gaps \n", - "- `CONF` = **confirmed** cases\n", - "- `PRES` = **presumed** cases (either `SUSP` - `TEST` or directly available as `PRES`) 👈 this is a parameter (`N1_METHOD`)\n", - "- `TEST` = **tested** cases \n", - "- **TPR** = Test Positivity Rate (`CONF` / `TEST`)\n", - " \n", - "This produces `N1`, the number of cases adjusted for testing gaps, calculated at the monthly level in line with WHO recommendations to capture intra-annual variation.\n", - "\n", - "Next, we adjust for incomplete reporting using: \n", - "> **N2** = **N1** / `REPORTING_RATE`\n", - "\n", - "where `REPORTING_RATE` is at the monthly levele, and is the ratio of received reports 
(submission to DHIS2) divided by the expected reports.\n", - "\n", - "Finally, _if_ **careseeking** data is **available**, N3 is calculated as follows:\n", - "> **N3** = N2 + (N2 * PROP_PRIV / PROP_PUBL) + (N2 * NO_TREAT / PROP_PUBL)\n", - "\n", - "where:\n", - "- PRIVATE_CARE = proportion of kids treated in the **private** sector\n", - "- PUBLIC_CARE = proportion of kids treated in the **public** sector\n", - "- NO_CARE = proportion of kids which **did not receive any treatment**\n", - "\n", - "Note that this assumes the same TPR across all sectors (private and public).\n", - "\n", - "\n", - "\n", - "**Important note**
\n", - "In case reporting rate equals zero (none of the health facilities reported in a given month), N2 is set to `NA`. Note that the annual N2 will be underestimated, which is preferable compared to having `Inf` values.\n", - "\n", - "-------------" - ] - }, - { - "cell_type": "markdown", - "id": "dcee32af-ae6d-4b2a-9c7a-f846209f1dc3", - "metadata": {}, - "source": [ - "This calculation expects (input):\n", - "* **routine_data**: DHIS2 routine data, formatted and aggregated at ADM2 and MONTH level. Tibble (df) _must_ contain the following cols: `YEAR`, `MONTH`, `ADM2`, `CONF`, `TEST`, `SUSP`, `PRES`. \n", - "* **reporting_rate_data**: reporting rate calculated at ADM2 and MONTH level and expressed as proprtion **(0-1)**. Tibble (df) _must_ contain the following cols: `ADM2`, `YEAR`, `MONTH`, `reporting_rate`\n", - "\n", - "The calculation produces (output):\n", - "* data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (`CONF`, `TEST`, `SUSP`, `PRES`), `TPR`, `N1`, `N2`\n", - "\n", - "-----------------" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1a0899a-3308-4d90-b06e-8a0cd4b849e1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Ensure correct data type for numerical columns ---------------------------------------\n", - "routine_data <- dhis2_routine %>%\n", - " mutate(across(any_of(c(\"YEAR\", \"MONTH\", \"CONF\", \"TEST\", \"SUSP\", \"PRES\")), as.numeric))\n", - "\n", - "reporting_rate_data <- reporting_rate_month %>% # reporting_rate_data\n", - " mutate(across(c(YEAR, MONTH, REPORTING_RATE), as.numeric))" - ] - }, - { - "cell_type": "markdown", - "id": "736dec8f", - "metadata": {}, - "source": [ - "#### 3.1.0. 
Aggregate at `ADM2` x `MONTH` & calculate **TPR**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8899964a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check for TEST > SUSP\n", - "routine_data |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58bf219e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Group & compute TPR\n", - "monthly_cases <- routine_data %>%\n", - " group_by(ADM1_ID, ADM2_ID, YEAR, MONTH) %>% # ADM1 needed to join careseeking data\n", - " summarise(\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " across(any_of(\"PRES\"), ~sum(., na.rm = TRUE), .names = \"PRES\"), # <- handles missing 'PRES' column gracefully\n", - " .groups = \"drop\") %>%\n", - " # Cleaning TEST data for \"SUSP-TEST\" method\n", - " mutate(TEST = ifelse(N1_METHOD == \"SUSP-TEST\" & !is.na(SUSP) & (TEST > SUSP), SUSP, TEST)) %>%\n", - " left_join(reporting_rate_data,\n", - " by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>% \n", - " # Calculate TPR based on CONF and TEST\n", - " # Note: if TEST is 0 or NA, set TPR = 1 (to avoid division by zero which produces Inf)\n", - " mutate( \n", - " TPR = ifelse(!is.na(CONF) & !is.na(TEST) & (TEST != 0), CONF / TEST, 1)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "938f0194", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check for TEST > SUSP\n", - "monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " - ] - }, - { - "cell_type": "markdown", - "id": "df43d6d8", - "metadata": {}, - "source": [ - "#### 3.1.1. 
Calculate **N1**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ead591bb-3936-486d-bb9e-7b01d0805d0d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate N1 based on `N1_METHOD` & availability of `PRES` \n", - "\n", - "if (N1_METHOD == \"SUSP-TEST\") {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", - " log_msg(\"Calculating N1 as `N1 = CONF + ((SUSP - TEST) * TPR)`\")\n", - "} else if (N1_METHOD == \"PRES\") {\n", - " # if: column named \"PRES\" exists in `monthly_cases` and contains at least one non-missing value\n", - " if (\"PRES\" %in% names(monthly_cases) && !all(is.na(monthly_cases$PRES))) {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + (PRES * TPR))\n", - " log_msg(\"ℹ️ Calculating N1 as `N1 = CONF + (PRES * TPR)`\")\n", - " } else {\n", - " log_msg(\"🚨 Warning: 'PRES' not found in routine data or contains all `NA` values! 🚨 Calculating N1 using 'SUSP-TEST' method instead.\")\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", - " }\n", - "} else {\n", - " log_msg(\"Invalid N1_METHOD. Please use 'PRES' or 'SUSP-TEST'.\") # not really necessary ... \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9543d283", - "metadata": {}, - "source": [ - "#### 3.1.2. 
Calculate **N2**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5060017a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate N2\n", - "monthly_cases <- monthly_cases %>%\n", - " mutate(\n", - " N2 = ifelse(REPORTING_RATE == 0, NA_real_, N1 / REPORTING_RATE) # On the fly convert `RR == 0` to NA to avoid N2 == Inf\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "debb1745-5066-4126-8a15-853b21ee8776", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Log msg about zero REPORTING RATE cases and warn that N2 set to NA\n", - "\n", - "zero_reporting <- reporting_rate_data %>%\n", - " filter(REPORTING_RATE == 0) %>%\n", - " summarise(\n", - " n_months_zero_reporting = n(),\n", - " affected_zones = n_distinct(ADM2_ID)\n", - " )\n", - "\n", - "if (zero_reporting$n_months_zero_reporting > 0) { \n", - " log_msg(glue(\"🚨 Note: {zero_reporting$n_months_zero_reporting} rows had `REPORTING_RATE == 0` across \",\n", - " \"{zero_reporting$affected_zones} ADM2. These N2 values were set to NA.\"))\n", - "} else {\n", - " log_msg(\"✅ Note: no ADM2 has `REPORTING_RATE == 0`. All N2 values were preserved.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "053f854f-ee8c-40a3-abd6-82d69cc7dca4", - "metadata": {}, - "source": [ - "#### 3.1.3. 
(optional) Compute **N3** with adjusted **N2** by 'care seeking data file' (csv)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a306e23-93b7-4b8d-84ec-e4afe413a613", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!is.null(care_seeking_data_f)) {\n", - " monthly_cases <- monthly_cases %>%\n", - " left_join(., care_seeking_data_f %>% select(ADM1_ID, PCT), by = c(\"ADM1_ID\")) %>%\n", - " mutate(\n", - " N3 = N2 / PCT\n", - " ) %>% \n", - " select(-PCT)\n", - " log_msg(\"N2 adjusted by care seeking data (NER Specific).\")\n", - " # head(monthly_cases)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "dd9b5677", - "metadata": {}, - "source": [ - "#### 3.1.4. (optional) Calculate **N3**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7aa926ed-99ea-474c-988e-8151d6b12002", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Only calculate N3 if CARESEEKING data is avaiable \n", - "if (!is.null(careseeking_data)) {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(YEAR = as.numeric(YEAR)) %>% # keep as safety\n", - " left_join(., careseeking_data, by = c(\"ADM1_ID\")) %>%\n", - " mutate(\n", - " N3 = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE) \n", - " )\n", - "} else {\n", - " print(\"🦘 Careseeking data not available, skipping calculation of N3.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a67ddc0e-40ea-41d7-9fef-5e05d9594956", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(monthly_cases, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "fb4214ba", - "metadata": {}, - "source": [ - "#### 💾 Export `monthly_cases` (for 📓report notebook)\n", - "For coherence checks, which need monthly resolution ... !" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d80a7cb6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Save monthly_cases as .parquet file \n", - "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", - "arrow::write_parquet(monthly_cases, file_path)\n", - "\n", - "# Log msg\n", - "log_msg(glue(\"Monthly cases data saved to: {file_path}\"))\n", - "head(monthly_cases)" - ] - }, - { - "cell_type": "markdown", - "id": "7b50302e-20af-4fa6-8e8c-1e3a6c763ea2", - "metadata": {}, - "source": [ - "### 🔍 Data **coherence** checks on **monthly cases**\n", - "Check for ratios or differences that will cause negative values -> which will causes adjusted incidence to be lower than the values it adjust\n", - "\n", - "\n", - "Namely, the following relationships among INDICATORs:\n", - "* SUSP-TEST\n", - "* CONF/TEST\n", - "* N1 == CONF ... (when PRES == 0)" - ] - }, - { - "cell_type": "markdown", - "id": "e9f7ae73-46f6-4c78-9bb2-fdcfbd591b10", - "metadata": {}, - "source": [ - "#### 1. `PRES == 0`: causes `N1 == CONF` \n", - "(if `N1_METHOD == \"PRES\"`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "495fe18a-50ad-4eff-8669-135c63a7c8dd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Run this check only if N1_METHOD == \"PRES\" (else, problem doesn't exist)\n", - "if (N1_METHOD == \"PRES\") {\n", - " nr_of_pres_0_adm2_month <- monthly_cases |> filter(PRES == 0) |> nrow()\n", - " log_msg(glue(\"🚨 Note: using `PRES` for incidence adjustement, but `PRES == 0` for {nr_of_pres_0_adm2_month} rows (ADM2 x MONTH).\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e12e744b-540e-462c-a16e-edbb05ddc047", - "metadata": {}, - "source": [ - "#### 2. 
`SUSP-TEST`: if negative, then N1 smaller or equal to CONF (ADJ =< CRUDE)\n", - "(if `N1_METHOD == \"SUSP-TEST\"`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49de98f1-2424-440e-922f-72d7702dd894", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# SUSP - TEST: if negative (TEST > SUSP), then N1 smaller or equal to CONF, which then causes ADJ ≤ CRUDE\n", - "if (N1_METHOD == \"SUSP-TEST\") {\n", - " nr_of_negative <- monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() \n", - " if (nr_of_negative > 0) {\n", - " log_msg(\n", - " glue(\"🚨 Note: using formula `SUSP - TEST` for incidence adjustement, but higher tested than suspected cases (`SUSP < TEST`) detected in {nr_of_negative} rows (ADM2 x MONTH).\"),\n", - " \"warning\"\n", - " )\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "d72d7545-9afa-45c6-9efd-2619aecfc794", - "metadata": {}, - "source": [ - "#### 3. 
`CONF/TEST` = `TPR` (to calculate N1: Incidence adjusted for **Testing**)\n", - "This **ratio should** always be **≤ 1** because **there should _not_ be more confirmed cases than tested** ...\n", - "\n", - "(but if very small, then N1 could be smaller or equal to CONF (so ADJ INC ≤ CRUDE))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9cc60295-5046-4932-b332-965fd320f72e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "more_confirmed_than_tested <- monthly_cases |> mutate(CONF_divby_TEST = CONF / TEST) |> filter(CONF_divby_TEST > 1) |> nrow() \n", - "\n", - "if (more_confirmed_than_tested > 0) {\n", - " log_msg(glue(\"🚨 Note: higher confirmed than tested cases (`CONF/TEST`) detected in {more_confirmed_than_tested} rows (ADM2 x MONTH).\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "acbabb99-07ce-4054-a702-2d3cd59c328e", - "metadata": {}, - "source": [ - "### 3.2 **Yearly incidence**\n", - "After calculating N1 and N2 for each `ADM2`-`MONTH`, we aggregate the data annually to compute the yearly totals (sums) for crude cases (`CONF`), `N1` and `N2`. Finally, we compute:\n", - "* Crude incidence: C / POP × 1000\n", - "* Incidence adjusted for testing: N1 / POP × 1000\n", - "* Incidence adjusted for testing and reporting: N2 / POP × 1000\n", - "* Incidence adjusted for testing, reporting and careseeking behaviour (optional): N3 / POP × 1000\n", - "\n", - "--------------" - ] - }, - { - "cell_type": "markdown", - "id": "d47a3908-71cb-4e79-8771-f6caceae4ce2", - "metadata": {}, - "source": [ - "The calculation expects (input):\n", - "* **monthly_cases**: as the output of `calculate_monthly_cases()`, or a tibble/data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (CONF, TEST, SUSP, PRES), `TPR`, `N1`, `N2` \n", - "* **population_data**: df of population data formatted and aligned, aggregated at ADM2 and YEAR level. 
A tibble/data frame that _must_ contain the following cols: `ADM2`, `YEAR`, `POPULATION`\n", - "\n", - "The calculation produces (output): \n", - "* a data frame with the following cols: ADM2_ID, YEAR, CONF, N1, N2, `INCIDENCE_CRUDE`, `INCIDENCE_ADJ_TESTING`, `INCIDENCE_ADJ_REPORTING`\n", - "\n", - "--------------------" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8753721-067b-4da3-8305-8d98f823454f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 1. Enforce column types upfront ----\n", - "monthly_cases <- monthly_cases %>% \n", - " mutate(across(where(is.numeric), as.numeric)) # Convert all numeric columns\n", - " \n", - "population_data <- dhis2_population_adm2 %>% # population_data\n", - " mutate(across(c(YEAR, POPULATION), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4a9ea81-4f7c-4505-8847-07de13831a42", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 2. Core calculation ----\n", - "yearly_incidence <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " # 🚨 removed `na.rm = TRUE` on 20250702 - if things break check here! 
🚨 \n", - " across(c(CONF, N1, N2), ~sum(.)), #, na.rm = TRUE)), # 🔍 PROBLEM: if NA's in N2 (due to missing RR data), the sum of N2 by YEAR is smaller than the sum of N1 !\n", - " # across(any_of(c(\"CONF\", \"TEST\", \"SUSP\", \"PRES\", \"N1\", \"N2\")), ~sum(.)), # silenced as not necessary to also summarize \"TEST\", \"SUSP\", \"PRES\"\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " left_join(\n", - " population_data,\n", - " by = c(\"ADM2_ID\", \"YEAR\")\n", - " ) %>%\n", - " mutate(\n", - " INCIDENCE_CRUDE = CONF / POPULATION * 1000,\n", - " INCIDENCE_ADJ_TESTING = N1 / POPULATION * 1000,\n", - " INCIDENCE_ADJ_REPORTING = N2 / POPULATION * 1000\n", - " ) |>\n", - " ungroup()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c712f4c2-d677-4d22-8298-111aa0a93034", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 3.1 Optional careseeking data CSV adjustment ---- \n", - "if (!is.null(care_seeking_data_f) && \"N3\" %in% names(monthly_cases)) {\n", - " n3_data <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(N3 = sum(N3, na.rm = TRUE),\n", - " .groups = \"drop\") |>\n", - " ungroup()\n", - " \n", - " yearly_incidence <- yearly_incidence %>%\n", - " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", - " )\n", - " } else {\n", - " yearly_incidence <- yearly_incidence |>\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = NA\n", - " )\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7602a82b-8829-4613-8961-61c419073269", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 3.2 Optional careseeking adjustment ----\n", - "if (is.null(care_seeking_data_f)) { # quick fix\n", - " \n", - " if (!is.null(careseeking_data) && \"N3\" %in% names(monthly_cases)) {\n", - " n3_data <- monthly_cases %>%\n", - " 
group_by(ADM2_ID, YEAR) %>%\n", - " summarise(N3 = sum(N3, na.rm = TRUE),\n", - " .groups = \"drop\") |>\n", - " ungroup()\n", - " \n", - " yearly_incidence <- yearly_incidence %>%\n", - " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", - " )\n", - " } else {\n", - " yearly_incidence <- yearly_incidence |>\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = NA\n", - " )\n", - " }\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56001b15-f74e-42d9-bfa2-bd5563b6a512", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(yearly_incidence, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "7976f894-daf4-46c3-9fa1-5303cbba0818", - "metadata": {}, - "source": [ - "### 🔍 Data **coherence** checks on **yearly incidence**\n", - "Here we check if values of Indicidence (already at `YEAR` resolution) make sense in relation to each other.
\n", - "Namely:\n", - "* crude values should be the lowest, and any consecutive **adjustment** should cause the incidence values to **increase** or remain the **same** - but should never be lower!" - ] - }, - { - "cell_type": "markdown", - "id": "d3dfac34-86f5-4f8c-add9-f54485259924", - "metadata": {}, - "source": [ - "#### 1. `INCIDENCE_ADJ_TESTING` (adj. level 1) should always be greater than `INCIDENCE_CRUDE` (not adjusted)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acb03778-f1db-4f28-9b09-3cd8d815f976", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# same as below but different cols ... \n", - "# Count TRUE values, handling potential NAs in the result of if_else\n", - "nr_of_impossible_values <- yearly_incidence |>\n", - " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE)) |>\n", - " pull(IMPOSSIBLE_VALUE) |>\n", - " sum(na.rm = TRUE) \n", - "\n", - "# Warning if any impossible values are found\n", - "if (nr_of_impossible_values > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE!\"), \"warning\")\n", - "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_CRUDE` is smaller than `INCIDENCE_ADJ_TESTING` (as expected).\")\n", - "\n", - "# Check if all values in a column are NA\n", - "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_TESTING))) {\n", - " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_TESTING` are `NA`s\", \"warning\")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "827d1e84-7f43-404c-88cc-9b675bfa48a1", - "metadata": {}, - "source": [ - "#### 2. `INCIDENCE_ADJ_REPORTING` (adj. level 2) should always be greater than `INCIDENCE_ADJ_TESTING` (adj. 
level 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b1da976-b157-4e94-b5e9-8795e87bb416", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count TRUE values, handling potential NAs in the result of if_else\n", - "nr_of_impossible_values <- yearly_incidence |>\n", - " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)) |>\n", - " pull(IMPOSSIBLE_VALUE) |>\n", - " sum(na.rm = TRUE) \n", - "\n", - "# Warning if any impossible values are found\n", - "if (nr_of_impossible_values > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING!\"), \"warning\")\n", - "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_ADJ_TESTING` is smaller than `INCIDENCE_ADJ_REPORTING` (as expected).\")\n", - "\n", - "# Check if all values in a column are NA\n", - "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_REPORTING))) {\n", - " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_REPORTING` are `NA`s\", \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3e57f2e8-1ccc-417c-9fa6-e6b1976336bc", - "metadata": {}, - "source": [ - "## 4. Export to `/data/dhis2_incidence/` folder" - ] - }, - { - "cell_type": "markdown", - "id": "5b6861bb", - "metadata": {}, - "source": [ - "### 4.0. 
Keep only essential cols \n", - "Based on [SNT Pipelines Data glossary](https://docs.google.com/spreadsheets/d/1qvZMsmCWU6cVLgGZTEXsd5xmoecIxb4LAd-g_2qzYdw/edit?usp=sharing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4085515", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "yearly_incidence <- yearly_incidence |>\n", - "select(\n", - " YEAR, \n", - " starts_with(\"ADM\"),\n", - " starts_with(\"POPULATION\"),\n", - " starts_with(\"INCIDENCE\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e432998-bf85-4706-bea4-8684b0b58c16", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Reusable function to generate filename and save data\n", - "save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function) {\n", - " \n", - " base_name_parts <- c(COUNTRY_CODE, \"_incidence\")\n", - " \n", - " # Concatenate all parts to form the final filename\n", - " file_name <- paste0(c(base_name_parts, file_extension), collapse = \"\")\n", - " # file_path <- file.path(data_path, \"incidence\", file_name)\n", - " file_path <- file.path(data_path, file_name)\n", - " output_dir <- dirname(file_path)\n", - "\n", - " # Check if the output directory exists, else create it\n", - " if (!dir.exists(output_dir)) {\n", - " dir.create(output_dir, recursive = TRUE)\n", - " }\n", - "\n", - " # Flexibility to use function as provided in argument: \"write_csv\" or \"arrow::write_parquet\" ... 
\n", - " write_function(yearly_incidence, file_path)\n", - "\n", - " log_msg(paste0(\"Exporting : \", file_path))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3ed49106-b335-42c3-9511-ffd864dd50f0", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic \n", - "\n", - "Provide a msg to the user to indicate that the results correspond to a specific version of indicators and population (under5, pregnant or totals)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f15c0c4-74d9-4280-ba46-8add435a9147", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# if (COUNTRY_CODE == \"NER\" & INDICATORS_FOUND) {\n", - "if (INDICATORS_FOUND) {\n", - " log_msg(glue(\"ℹ️ The results have been computed using the following Indicators: {paste(target_colnames, collapse=', ')}\"))\n", - " log_msg(glue(\"ℹ️ The results have been computed using the following Population: {POPULATION_SELECTION}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16e7d83b-3962-4041-9d2d-aaa362b62d5f", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# Export the data\n", - "\n", - "# CSV\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv)\n", - "\n", - "# Parquet\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - 
"nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb b/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb index 8ae33d1..9b0ff8e 100644 --- a/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb +++ b/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb @@ -1,1649 +1,1649 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f5827740-2917-4504-9017-9ec7d408e5f4", - "metadata": {}, - "source": [ - "Script structure:\n", - " 1. Setup:\n", - " * Paths\n", - " * Utils functions\n", - " * Load and check config file\n", - " 2. Load Data\n", - " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", - " * **Population data** (DHIS2) already formatted & aggregated (output of pipeline YYY) & aggregated at **ADM2 x YEAR** level
\n", - " **Note**: in some Countries (i.e., Niger), population and and crude incidence data is also available for **specific sections** of the popultion (i.e., preganant women, children under 5)\n", - " * (optional) **Care seeking (taux recherche soins)** (DHS)\n", - " * **Reporting Rate**, based on what is available (last run reporting rate pipeline), uses _either_ one of:\n", - " * \"**Dataset**\": pre-cumputed (directly downloadable from SNIS DHIS2 instance) and formatted&aligned elsewhere (output of pipelibe `dhis2-reporting-rate`)\n", - " * \"**Data Element**: calculated from routine DHIS2 data, based on reports for defined indicators and \"active\" facilities\n", - " 3. Calculate **Incidence**\n", - " 1. calculate **monthly cases**\n", - " 2. calculate **yearly incidence**: Crude, Adjusted 1 (Test Positivity Rate), Adjusted 2 (Reporting Rate), (optional) Adjusted 3 (Care Seeking Behaviour)" - ] - }, - { - "cell_type": "markdown", - "id": "cdd5409b-dc0e-45f4-ae4e-dffcdb25059b", - "metadata": {}, - "source": [ - "-------------------\n", - "**Naming harmonization to improve code readability:**\n", - "\n", - "**Incidence**, COLUMN NAMES (always capitalized!):\n", - "* \"INCIDENCE_CRUDE\" = \"Crude\"\n", - "* \"INCIDENCE_ADJ_TESTING\" = \"Adjusted 1 (Testing)\"\n", - "* \"INCIDENCE_ADJ_REPORTING\" = \"Adjusted 2 (Reporting)\"\n", - "* _\"INCIDENCE_ADJ_CARESEEKING\" = \"Adjusted 3 (Careseeking)\"_ ⚠️is this good naming?" - ] - }, - { - "cell_type": "markdown", - "id": "96d5dffc-ff34-4a14-b2b7-1e71e6afad07", - "metadata": {}, - "source": [ - "**Reporting Rate** data frames, based on two **methods**:\n", - "* follwo this structure: reporting\\_rate\\_\\. 
So:\n", - " * **Dataset**: `reporting_rate_dataset` (for report nb only: `reporting_rate_dataset_year`)\n", - " * **Data Element** (Diallo 2025): `reporting_rate_dataelement` (for report nb only: `reporting_rate_dataelement_year`)" - ] - }, - { - "cell_type": "markdown", - "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", - "metadata": {}, - "source": [ - "--------------------" - ] - }, - { - "cell_type": "markdown", - "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", - "metadata": {}, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "markdown", - "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", - "metadata": {}, - "source": [ - "### 1.0. Fallback parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72fad25e-85fd-4ae9-8fe3-c142077f8d67", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ----- ⚡ Defined in pipeline.py code ---------------\n", - "if (!exists(\"N1_METHOD\")) N1_METHOD <- \"SUSP-TEST\" # ⚡ For N1 calculations: use `SUSP-TEST` or `PRES`\n", - "if (!exists(\"ROUTINE_DATA_CHOICE\")) ROUTINE_DATA_CHOICE <- \"raw\" # \"raw\" \"raw_without_outliers\" \"imputed\"\n", - "if (!exists(\"USE_CSB_DATA\")) USE_CSB_DATA <- FALSE # ⚡ USE_CSB_DATA bool\n", - "if (!exists(\"USE_ADJUSTED_POPULATION\")) USE_ADJUSTED_POPULATION <- FALSE # ⚡ USE_ADJUSTED_POPULATION bool " - ] - }, - { - "cell_type": "markdown", - "id": "d7d2f065-f8ad-4580-aa24-64a6d9bd7acb", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation \n", - "Only for countries in which disaggregated data is available. Pipeline fails if you select something you don't have." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63362c4a-6a55-4310-aa7a-81bea39aa734", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!exists(\"DISAGGREGATION_SELECTION\")) DISAGGREGATION_SELECTION <- \"UNDER_5\" # NULL # Options: \"PREGNANT_WOMAN\", \"UNDER_5\", ... 
\n", - "# Disaggregation options set in pipeline.py parameters, based on \n", - "# https://bluesquare.atlassian.net/browse/SNT25-363?focusedCommentId=85587" - ] - }, - { - "cell_type": "markdown", - "id": "ecff4a51-c6fa-4e84-a465-5bb87d9b1333", - "metadata": {}, - "source": [ - "### 1.1. Run setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# PROJECT PATHS\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", - "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\")) # palettes \n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right path\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22dbb20b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!dir.exists(INTERMEDIATE_DATA_PATH)) {\n", - " dir.create(INTERMEDIATE_DATA_PATH, recursive = TRUE)\n", - " 
log_msg(glue(\"Created directory for intermediate results: {INTERMEDIATE_DATA_PATH}\"))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", - "metadata": {}, - "source": [ - "### 1.2. Load and check `config` file\n", - "\n", - "**Checks for SNT mandatory configuration fields**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")) \n", - "log_msg(msg)\n", - "\n", - "# Generic\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"ANY\"\n", - "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "# Fixed routine formatting columns\n", - "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') \n", - "print(paste(\"Fixed routine data ('dhis2_routine') columns (always expected): \", paste(fixed_cols, collapse=\", \")))" - ] - }, - { - "cell_type": "markdown", - "id": "95006478", - "metadata": {}, - "source": [ - "### 1.3. 
Helper function(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa504ca5-928c-4778-ad31-5c4de7bbbf60", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# helper function \n", - "resolve_routine_filename <- function(routine_choice) { \n", - " if (routine_choice == \"raw\") return(\"_routine.parquet\")\n", - " is_removed <- FALSE\n", - " if (routine_choice == \"raw_without_outliers\") is_removed <- TRUE \n", - " removed_status <- if (is_removed) \"removed\" else \"imputed\" \n", - " return(glue::glue(\"_routine_outliers_{removed_status}.parquet\"))\n", - "} " - ] - }, - { - "cell_type": "markdown", - "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", - "metadata": {}, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", - "metadata": {}, - "source": [ - "### 2.1. **Routine** data (DHIS2) (parametrized choice)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ddb31b18", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select routine dataset and filename\n", - "if (ROUTINE_DATA_CHOICE == \"raw\") { \n", - " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", - " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", - "} else { \n", - " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", - " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30691d35-f859-4f92-8eb2-5791a425f153", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load file from dataset \n", - "dhis2_routine <- tryCatch({ 
get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, \n", - " error = function(e) { \n", - " # Check if the error message indicates that the file does not exist \n", - " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", - " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", - " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", - " } else {\n", - " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". [ERROR DETAILS] \" , conditionMessage(e))\n", - " } \n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 routine data : `\", routine_filename, \"` loaded from dataset : `\", routine_dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", - "log_msg(msg)\n", - "\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "b78c12ec-407f-4088-9a7f-08838b2d208b", - "metadata": {}, - "source": [ - "#### Checks on routine data columns" - ] - }, - { - "cell_type": "markdown", - "id": "b1dcb02d", - "metadata": {}, - "source": [ - " `fixed_cols`: Fixed columns that should be always present regardless of the config." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3514f20-3726-436e-b34b-7a171d1718d4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if all \"fixed\" cols are present in dhis2_routine\n", - "actual_cols <- colnames(dhis2_routine) # dhis2_routine\n", - "missing_cols <- setdiff(fixed_cols, actual_cols) # Columns in fixed_cols but not in actual_cols)\n", - "\n", - "# Check if all required columns are present\n", - "all_present <- length(missing_cols) == 0\n", - "if (all_present) { \n", - " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'fixed' columns: \", paste(fixed_cols, collapse = \", \"), \".\"))\n", - "} else {\n", - " log_msg(paste0(\"🚨 Missing Columns: The following required columns are NOT present in 'dhis2_routine': \", paste(missing_cols, collapse = \", \"), \".\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "cd203dec-61b2-4510-9c84-30054e7b99e2", - "metadata": {}, - "source": [ - "`DHIS2_INDICATORS`: Indicators, as defined in the config.json file, are expected to be present if the extraction pipeline and this pipeline are run on the same config settings." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb04b888-8c5e-452a-8eb4-96025b0fa65a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if all \"DHIS2_INDICATORS\" cols are present in dhis2_routine\n", - "missing_cols <- setdiff(DHIS2_INDICATORS, actual_cols) # all elements in DHIS2_INDICATORS but not in actual_cols\n", - "all_present <- length(missing_cols) == 0\n", - "if (all_present) { \n", - " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'DHIS2_INDICATORS' columns: \", paste(DHIS2_INDICATORS, collapse = \", \"), \".\"))\n", - "} else {\n", - " log_msg(paste0(\n", - " \"🚨 Missing Columns: The following columns for DHIS2 INDICATORS are NOT present in 'dhis2_routine': \",\n", - " paste(missing_cols, collapse = \", \"),\n", - " \".\\n🚨 Looks like the config.json file was modified after extraction.\\n🚨 The analysis will continue WITHOUT the missing indicators.\"\n", - " ), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3ba1a6e8-aa08-4624-a6a5-4852bf4127e4", - "metadata": {}, - "source": [ - "#### Checks on `N1_METHOD` selected\n", - "_**if**_ `N1_METHOD == PRES` then `PRES` must exist in config.json file _and_ in routine data
\n", - "_**else**_ N1 will use `SUSP-TEST` instead" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96a7025e-083b-464d-8498-f7fdff493293", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check that col `PRES` exists in both config file and routine data\n", - "if (N1_METHOD == \"PRES\") {\n", - " pres_in_routine <- any(names(dhis2_routine) == \"PRES\")\n", - " pres_in_config <- any(DHIS2_INDICATORS == \"PRES\")\n", - "\n", - " if (!pres_in_routine) {\n", - " log_msg(\"🛑 Column `PRES` missing from routine data! 🚨 N1 calculations will use `SUSP-TEST` instead!\", \"error\")\n", - " stop()\n", - " }\n", - " if (!pres_in_config) {\n", - " log_msg(\"⚙️ Note: `PRES` set as parameter in this pipeline, but not defined as indicator in the configuration file (SNT_config.json)\", \"error\")\n", - " stop()\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "1c5e84cf", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "458e3d78-3552-4447-93f8-6812a5d655be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "INDICATORS_FOUND <- FALSE # 👈 \n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && !is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", - "if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", - "\n", - " # Determine the dynamic prefix based on the method\n", - " prefix_method <- ifelse(N1_METHOD == \"SUSP-TEST\", \"SUSP\", \"PRES\")\n", - " prefix_all <- c(prefix_method, \"TEST\", \"CONF\") \n", - " # Define the expected column names \n", - " # (also make available for the 'else' warning message if the check fails)\n", - " target_colnames <- glue(\"{prefix_all}_{DISAGGREGATION_SELECTION}\")\n", - " \n", - " if (all(target_colnames %in% colnames(dhis2_routine))) {\n", - " \n", - " # We map 
the specific columns (e.g., SUSP_UNDER5) to generic names (e.g., SUSP)\n", - " dhis2_routine[prefix_all] <- dhis2_routine[target_colnames]\n", - " \n", - " for (col in target_colnames) {\n", - " log_msg(glue(\"Population Disaggregation: Successfully mapped indicator: {col}\"))\n", - " }\n", - " \n", - " # Signal success for the next code block\n", - " INDICATORS_FOUND <- TRUE\n", - " \n", - " } else {\n", - " missing_cols <- setdiff(target_colnames, colnames(dhis2_routine))\n", - " log_msg(glue(\"Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed.\"), \"warning\")\n", - " log_msg(glue(\"Population Disaggregation: Missing columns in routine dataset: {paste(missing_cols, collapse = ', ')}\"), \"warning\")\n", - " \n", - " msg <- glue(\"[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.\") \n", - " stop(msg)\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "4473e75e-94d2-4f24-b6eb-38a7685542ad", - "metadata": {}, - "source": [ - "### 2.2. Load population data at level ADM2 x YEAR\n", - "\n", - "Already formatted & aggregated. 
\n", - "\n", - "**Expecting** table with these **cols** (bold = **must have**): \n", - "* ADM1_ID\n", - "* **ADM2_ID**\n", - "* **YEAR**\n", - "* **POPULATION** (pop at ADM2 level)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ce922c7-6dab-44cf-a94f-8a03d1f816a1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Select population file \n", - "if (USE_ADJUSTED_POPULATION) {\n", - " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", - "} else {\n", - " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_population_adm2 <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_pop_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, \n", - " \" [ERROR DETAILS] \", conditionMessage(e)) # log error message , \n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "log_msg(glue(\"DHIS2 population data loaded from dataset: {dhis2_pop_dataset}. 
Dataframe dimensions: {paste(dim(dhis2_population_adm2), collapse=', ')}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7163965", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_population_adm2 |> head()" - ] - }, - { - "cell_type": "markdown", - "id": "6ae0c5fa", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8620491", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (INDICATORS_FOUND) { \n", - " POPULATION_SELECTION <- paste0(\"POP_\", DISAGGREGATION_SELECTION) \n", - " if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) {\n", - " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset.\"), \"warning\")\n", - " POPULATION_SELECTION <- \"POPULATION\"\n", - " }\n", - " # The selected column is assigned to POPULATION col so that later code can use it generically\n", - " dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]]\n", - " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.\"))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e596d0ed-56df-4756-83ed-717cfa72f643", - "metadata": {}, - "source": [ - "#### 2.2.1 **Population** data (DHIS2) columns selection.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5107756-f007-4c39-a4f6-b2ab0a653bd5", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_population_adm2 <- dhis2_population_adm2 %>% select(YEAR, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, POPULATION) \n", - "\n", - "dim(dhis2_population_adm2)\n", - "head(dhis2_population_adm2, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "b42a65ab-ad8d-41ba-9edb-dc2636f03a06", - "metadata": {}, - "source": [ - "### 
2.3. (optional) **Care Seeking Behaviour** (CSB DHS) (taux recherche soins)\n", - "(20250728) Note: **changed units** (proportion to %), see https://bluesquare.atlassian.net/browse/SNT25-127 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a6a5338-9ffd-47d2-b92f-79deb7886078", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHS_INDICATORS\n", - "file_name <- glue::glue(\"{COUNTRY_CODE}_DHS_ADM1_PCT_CARESEEKING_SAMPLE_AVERAGE.parquet\")\n", - "\n", - "if (USE_CSB_DATA == TRUE) {\n", - " # Read the data, if error (cannot find at defined path) -> set careseeking_data to NULL (so it doesn't break the function at # 3.)\n", - " careseeking_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"🛑 Error while loading DHS Care Seeking data file from `\", dataset_name, file_name ,\"`.\", conditionMessage(e)) # log error message\n", - " log_msg(msg, \"error\")\n", - " return(NULL) # make object NULL on error\n", - " })\n", - " \n", - " # Only print success messages and data info if careseeking_data is NOT NULL\n", - " if (!is.null(careseeking_data)) {\n", - " log_msg(paste0(\"Care Seeking data : \", file_name, \" loaded from dataset : \", dataset_name))\n", - " log_msg(paste0(\"Care Seeking data frame dimensions: \", nrow(careseeking_data), \" rows, \", ncol(careseeking_data), \" columns.\"))\n", - " head(careseeking_data)\n", - " } else {\n", - " log_msg(paste0(\"🚨 Care-seeking data not loaded due to an error, `careseeking_data` is set to `NULL`!\"), \"warning\")\n", - " }\n", - " \n", - "} else {\n", - " # if `USE_CSB_DATA == FALSE` ... (basically, ignore CSB data)\n", - " careseeking_data <- NULL\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "92723594-000b-41ee-82a1-8e69106a277d", - "metadata": {}, - "source": [ - "### 2.4. 
Load Reporting Rate \n", - "\n", - "Import Reporting Rate file based on what is available in the latest OH Dataset version (which depends on last run reporting rate pipepline).\n", - "\n", - "📅 **Important**: reporting rate must be **monthly**!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5722d64-ce61-4244-960e-57ebac28e4cf", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "f5827740-2917-4504-9017-9ec7d408e5f4", + "metadata": {}, + "source": [ + "Script structure:\n", + " 1. Setup:\n", + " * Paths\n", + " * Utils functions\n", + " * Load and check config file\n", + " 2. Load Data\n", + " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", + " * **Population data** (DHIS2) already formatted & aggregated (output of pipeline YYY) & aggregated at **ADM2 x YEAR** level
\n", + " **Note**: in some countries (e.g., Niger), population and crude incidence data are also available for **specific sections** of the population (e.g., pregnant women, children under 5)\n", + " * (optional) **Care seeking (taux recherche soins)** (DHS)\n", + " * **Reporting Rate**, based on what is available (last run reporting rate pipeline), uses _either_ one of:\n", + " * \"**Dataset**\": pre-computed (directly downloadable from SNIS DHIS2 instance) and formatted & aligned elsewhere (output of pipeline `dhis2-reporting-rate`)\n", + " * \"**Data Element**\": calculated from routine DHIS2 data, based on reports for defined indicators and \"active\" facilities\n", + " 3. Calculate **Incidence**\n", + " 1. calculate **monthly cases**\n", + " 2. calculate **yearly incidence**: Crude, Adjusted 1 (Test Positivity Rate), Adjusted 2 (Reporting Rate), (optional) Adjusted 3 (Care Seeking Behaviour)" + ] + }, + { + "cell_type": "markdown", + "id": "cdd5409b-dc0e-45f4-ae4e-dffcdb25059b", + "metadata": {}, + "source": [ + "-------------------\n", + "**Naming harmonization to improve code readability:**\n", + "\n", + "**Incidence**, COLUMN NAMES (always capitalized!):\n", + "* \"INCIDENCE_CRUDE\" = \"Crude\"\n", + "* \"INCIDENCE_ADJ_TESTING\" = \"Adjusted 1 (Testing)\"\n", + "* \"INCIDENCE_ADJ_REPORTING\" = \"Adjusted 2 (Reporting)\"\n", + "* _\"INCIDENCE_ADJ_CARESEEKING\" = \"Adjusted 3 (Careseeking)\"_ ⚠️is this good naming?" + ] + }, + { + "cell_type": "markdown", + "id": "96d5dffc-ff34-4a14-b2b7-1e71e6afad07", + "metadata": {}, + "source": [ + "**Reporting Rate** data frames, based on two **methods**:\n", + "* follow this structure: reporting\\_rate\\_\\<method\\>. 
So:\n", + " * **Dataset**: `reporting_rate_dataset` (for report nb only: `reporting_rate_dataset_year`)\n", + " * **Data Element** (Diallo 2025): `reporting_rate_dataelement` (for report nb only: `reporting_rate_dataelement_year`)" + ] + }, + { + "cell_type": "markdown", + "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", + "metadata": {}, + "source": [ + "--------------------" + ] + }, + { + "cell_type": "markdown", + "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", + "metadata": {}, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "markdown", + "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", + "metadata": {}, + "source": [ + "### 1.0. Fallback parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72fad25e-85fd-4ae9-8fe3-c142077f8d67", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ----- ⚡ Defined in pipeline.py code ---------------\n", + "if (!exists(\"N1_METHOD\")) N1_METHOD <- \"SUSP-TEST\" # ⚡ For N1 calculations: use `SUSP-TEST` or `PRES`\n", + "if (!exists(\"ROUTINE_DATA_CHOICE\")) ROUTINE_DATA_CHOICE <- \"raw\" # \"raw\" \"outliers_removed\" \"imputed\"\n", + "if (!exists(\"USE_CSB_DATA\")) USE_CSB_DATA <- FALSE # ⚡ USE_CSB_DATA bool\n", + "if (!exists(\"USE_ADJUSTED_POPULATION\")) USE_ADJUSTED_POPULATION <- FALSE # ⚡ USE_ADJUSTED_POPULATION bool " + ] + }, + { + "cell_type": "markdown", + "id": "d7d2f065-f8ad-4580-aa24-64a6d9bd7acb", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation \n", + "Only for countries in which disaggregated data is available. Pipeline fails if you select something you don't have." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63362c4a-6a55-4310-aa7a-81bea39aa734", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!exists(\"DISAGGREGATION_SELECTION\")) DISAGGREGATION_SELECTION <- \"UNDER_5\" # NULL # Options: \"PREGNANT_WOMAN\", \"UNDER_5\", ... 
\n", + "# Disaggregation options set in pipeline.py parameters, based on \n", + "# https://bluesquare.atlassian.net/browse/SNT25-363?focusedCommentId=85587" + ] + }, + { + "cell_type": "markdown", + "id": "ecff4a51-c6fa-4e84-a465-5bb87d9b1333", + "metadata": {}, + "source": [ + "### 1.1. Run setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# PROJECT PATHS\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", + "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\")) # utils\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\")) # palettes \n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right path\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22dbb20b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!dir.exists(INTERMEDIATE_DATA_PATH)) {\n", + " dir.create(INTERMEDIATE_DATA_PATH, recursive = TRUE)\n", + " 
log_msg(glue(\"Created directory for intermediate results: {INTERMEDIATE_DATA_PATH}\"))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", + "metadata": {}, + "source": [ + "### 1.2. Load and check `config` file\n", + "\n", + "**Checks for SNT mandatory configuration fields**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")) \n", + "log_msg(msg)\n", + "\n", + "# Generic\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"ANY\"\n", + "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "\n", + "# Fixed routine formatting columns\n", + "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') \n", + "print(paste(\"Fixed routine data ('dhis2_routine') columns (always expected): \", paste(fixed_cols, collapse=\", \")))" + ] + }, + { + "cell_type": "markdown", + "id": "95006478", + "metadata": {}, + "source": [ + "### 1.3. 
Helper function(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa504ca5-928c-4778-ad31-5c4de7bbbf60", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Helper: map ROUTINE_DATA_CHOICE to the routine file suffix ('outliers_removed' and its pipeline.py alias 'raw_without_outliers' both select the outliers-removed file; any other non-raw value selects the imputed file) \n", + "resolve_routine_filename <- function(routine_choice) { \n", + " if (routine_choice == \"raw\") return(\"_routine.parquet\")\n", + " is_removed <- FALSE\n", + " if (routine_choice %in% c(\"outliers_removed\", \"raw_without_outliers\")) is_removed <- TRUE \n", + " removed_status <- if (is_removed) \"removed\" else \"imputed\" \n", + " return(glue::glue(\"_routine_outliers_{removed_status}.parquet\"))\n", + "} " + ] + }, + { + "cell_type": "markdown", + "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", + "metadata": {}, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", + "metadata": {}, + "source": [ + "### 2.1. **Routine** data (DHIS2) (parametrized choice)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb31b18", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select routine dataset and filename\n", + "if (ROUTINE_DATA_CHOICE == \"raw\") { \n", + " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", + " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", + "} else { \n", + " routine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + " routine_name <- resolve_routine_filename(ROUTINE_DATA_CHOICE)\n", + " routine_filename <- paste0(COUNTRY_CODE, routine_name)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30691d35-f859-4f92-8eb2-5791a425f153", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load file from dataset \n", + "dhis2_routine <- tryCatch({ 
get_latest_dataset_file_in_memory(routine_dataset_name, routine_filename) }, \n", + " error = function(e) { \n", + " # Check if the error message indicates that the file does not exist \n", + " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", + " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", + " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", + " } else {\n", + " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". [ERROR DETAILS] \" , conditionMessage(e))\n", + " } \n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 routine data : `\", routine_filename, \"` loaded from dataset : `\", routine_dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", + "log_msg(msg)\n", + "\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "b78c12ec-407f-4088-9a7f-08838b2d208b", + "metadata": {}, + "source": [ + "#### Checks on routine data columns" + ] + }, + { + "cell_type": "markdown", + "id": "b1dcb02d", + "metadata": {}, + "source": [ + " `fixed_cols`: Fixed columns that should be always present regardless of the config." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3514f20-3726-436e-b34b-7a171d1718d4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if all \"fixed\" cols are present in dhis2_routine\n", + "actual_cols <- colnames(dhis2_routine) # dhis2_routine\n", + "missing_cols <- setdiff(fixed_cols, actual_cols) # Columns in fixed_cols but not in actual_cols)\n", + "\n", + "# Check if all required columns are present\n", + "all_present <- length(missing_cols) == 0\n", + "if (all_present) { \n", + " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'fixed' columns: \", paste(fixed_cols, collapse = \", \"), \".\"))\n", + "} else {\n", + " log_msg(paste0(\"🚨 Missing Columns: The following required columns are NOT present in 'dhis2_routine': \", paste(missing_cols, collapse = \", \"), \".\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "cd203dec-61b2-4510-9c84-30054e7b99e2", + "metadata": {}, + "source": [ + "`DHIS2_INDICATORS`: Indicators, as defined in the config.json file, are expected to be present if the extraction pipeline and this pipeline are run on the same config settings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb04b888-8c5e-452a-8eb4-96025b0fa65a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if all \"DHIS2_INDICATORS\" cols are present in dhis2_routine\n", + "missing_cols <- setdiff(DHIS2_INDICATORS, actual_cols) # all elements in DHIS2_INDICATORS but not in actual_cols\n", + "all_present <- length(missing_cols) == 0\n", + "if (all_present) { \n", + " log_msg(paste0(\"The 'dhis2_routine' tibble contains all the expected 'DHIS2_INDICATORS' columns: \", paste(DHIS2_INDICATORS, collapse = \", \"), \".\"))\n", + "} else {\n", + " log_msg(paste0(\n", + " \"🚨 Missing Columns: The following columns for DHIS2 INDICATORS are NOT present in 'dhis2_routine': \",\n", + " paste(missing_cols, collapse = \", \"),\n", + " \".\\n🚨 Looks like the config.json file was modified after extraction.\\n🚨 The analysis will continue WITHOUT the missing indicators.\"\n", + " ), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3ba1a6e8-aa08-4624-a6a5-4852bf4127e4", + "metadata": {}, + "source": [ + "#### Checks on `N1_METHOD` selected\n", + "_**if**_ `N1_METHOD == PRES` then `PRES` must exist in config.json file _and_ in routine data
\n", + "_**else**_ N1 will use `SUSP-TEST` instead" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96a7025e-083b-464d-8498-f7fdff493293", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check that col `PRES` exists in both config file and routine data\n", + "if (N1_METHOD == \"PRES\") {\n", + " pres_in_routine <- any(names(dhis2_routine) == \"PRES\")\n", + " pres_in_config <- any(DHIS2_INDICATORS == \"PRES\")\n", + "\n", + " if (!pres_in_routine) {\n", + " log_msg(\"🛑 Column `PRES` missing from routine data! 🚨 N1 calculations will use `SUSP-TEST` instead!\", \"error\")\n", + " stop()\n", + " }\n", + " if (!pres_in_config) {\n", + " log_msg(\"⚙️ Note: `PRES` set as parameter in this pipeline, but not defined as indicator in the configuration file (SNT_config.json)\", \"error\")\n", + " stop()\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "1c5e84cf", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "458e3d78-3552-4447-93f8-6812a5d655be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "INDICATORS_FOUND <- FALSE # 👈 \n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && !is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", + "if (!is.null(DISAGGREGATION_SELECTION) && N1_METHOD %in% c(\"SUSP-TEST\", \"PRES\")) {\n", + "\n", + " # Determine the dynamic prefix based on the method\n", + " prefix_method <- ifelse(N1_METHOD == \"SUSP-TEST\", \"SUSP\", \"PRES\")\n", + " prefix_all <- c(prefix_method, \"TEST\", \"CONF\") \n", + " # Define the expected column names \n", + " # (also make available for the 'else' warning message if the check fails)\n", + " target_colnames <- glue(\"{prefix_all}_{DISAGGREGATION_SELECTION}\")\n", + " \n", + " if (all(target_colnames %in% colnames(dhis2_routine))) {\n", + " \n", + " # We map 
the specific columns (e.g., SUSP_UNDER5) to generic names (e.g., SUSP)\n", + " dhis2_routine[prefix_all] <- dhis2_routine[target_colnames]\n", + " \n", + " for (col in target_colnames) {\n", + " log_msg(glue(\"Population Disaggregation: Successfully mapped indicator: {col}\"))\n", + " }\n", + " \n", + " # Signal success for the next code block\n", + " INDICATORS_FOUND <- TRUE\n", + " \n", + " } else {\n", + " missing_cols <- setdiff(target_colnames, colnames(dhis2_routine))\n", + " log_msg(glue(\"Population Disaggregation: Disaggregation on '{DISAGGREGATION_SELECTION}' failed.\"), \"warning\")\n", + " log_msg(glue(\"Population Disaggregation: Missing columns in routine dataset: {paste(missing_cols, collapse = ', ')}\"), \"warning\")\n", + " \n", + " msg <- glue(\"[ERROR] 🛑 Population Disaggregation: Required columns for disaggregation '{DISAGGREGATION_SELECTION}' are missing.\") \n", + " stop(msg)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4473e75e-94d2-4f24-b6eb-38a7685542ad", + "metadata": {}, + "source": [ + "### 2.2. Load population data at level ADM2 x YEAR\n", + "\n", + "Already formatted & aggregated. 
\n", + "\n", + "**Expecting** table with these **cols** (bold = **must have**): \n", + "* ADM1_ID\n", + "* **ADM2_ID**\n", + "* **YEAR**\n", + "* **POPULATION** (pop at ADM2 level)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ce922c7-6dab-44cf-a94f-8a03d1f816a1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Select population file \n", + "if (USE_ADJUSTED_POPULATION) {\n", + " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_POPULATION_TRANSFORMATION\n", + "} else {\n", + " dhis2_pop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "}\n", + " \n", + "# Load file from dataset\n", + "dhis2_population_adm2 <- tryCatch({ get_latest_dataset_file_in_memory(dhis2_pop_dataset, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"[ERROR] Error while loading DHIS2 population file for: \" , COUNTRY_CODE, \n", + " \" [ERROR DETAILS] \", conditionMessage(e)) # log error message , \n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "log_msg(glue(\"DHIS2 population data loaded from dataset: {dhis2_pop_dataset}. 
Dataframe dimensions: {paste(dim(dhis2_population_adm2), collapse=', ')}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7163965", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_population_adm2 |> head()" + ] + }, + { + "cell_type": "markdown", + "id": "6ae0c5fa", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8620491", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (INDICATORS_FOUND) { \n", + " POPULATION_SELECTION <- paste0(\"POP_\", DISAGGREGATION_SELECTION) \n", + " if (!(POPULATION_SELECTION %in% colnames(dhis2_population_adm2))) {\n", + " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' not found in Population dataset.\"), \"warning\")\n", + " POPULATION_SELECTION <- \"POPULATION\"\n", + " }\n", + " # The selected column is assigned to POPULATION col so that later code can use it generically\n", + " dhis2_population_adm2$POPULATION <- dhis2_population_adm2[[POPULATION_SELECTION]]\n", + " log_msg(glue(\"Population Disaggregation: Column '{POPULATION_SELECTION}' selected as population values.\"))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e596d0ed-56df-4756-83ed-717cfa72f643", + "metadata": {}, + "source": [ + "#### 2.2.1 **Population** data (DHIS2) columns selection.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5107756-f007-4c39-a4f6-b2ab0a653bd5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_population_adm2 <- dhis2_population_adm2 %>% select(YEAR, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, POPULATION) \n", + "\n", + "dim(dhis2_population_adm2)\n", + "head(dhis2_population_adm2, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "b42a65ab-ad8d-41ba-9edb-dc2636f03a06", + "metadata": {}, + "source": [ + "### 
2.3. (optional) **Care Seeking Behaviour** (CSB DHS) (taux recherche soins)\n", + "(20250728) Note: **changed units** (proportion to %), see https://bluesquare.atlassian.net/browse/SNT25-127 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a6a5338-9ffd-47d2-b92f-79deb7886078", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHS_INDICATORS\n", + "file_name <- glue::glue(\"{COUNTRY_CODE}_DHS_ADM1_PCT_CARESEEKING_SAMPLE_AVERAGE.parquet\")\n", + "\n", + "if (USE_CSB_DATA == TRUE) {\n", + " # Read the data, if error (cannot find at defined path) -> set careseeking_data to NULL (so it doesn't break the function at # 3.)\n", + " careseeking_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"🛑 Error while loading DHS Care Seeking data file from `\", dataset_name, file_name ,\"`.\", conditionMessage(e)) # log error message\n", + " log_msg(msg, \"error\")\n", + " return(NULL) # make object NULL on error\n", + " })\n", + " \n", + " # Only print success messages and data info if careseeking_data is NOT NULL\n", + " if (!is.null(careseeking_data)) {\n", + " log_msg(paste0(\"Care Seeking data : \", file_name, \" loaded from dataset : \", dataset_name))\n", + " log_msg(paste0(\"Care Seeking data frame dimensions: \", nrow(careseeking_data), \" rows, \", ncol(careseeking_data), \" columns.\"))\n", + " head(careseeking_data)\n", + " } else {\n", + " log_msg(paste0(\"🚨 Care-seeking data not loaded due to an error, `careseeking_data` is set to `NULL`!\"), \"warning\")\n", + " }\n", + " \n", + "} else {\n", + " # if `USE_CSB_DATA == FALSE` ... (basically, ignore CSB data)\n", + " careseeking_data <- NULL\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "92723594-000b-41ee-82a1-8e69106a277d", + "metadata": {}, + "source": [ + "### 2.4. 
Load Reporting Rate \n", + "\n", + "Import Reporting Rate file based on what is available in the latest OH Dataset version (which depends on last run reporting rate pipeline).\n", + "\n", + "📅 **Important**: reporting rate must be **monthly**!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5722d64-ce61-4244-960e-57ebac28e4cf", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load the monthly reporting rate: try the 'dataelement' file first, fall back to the 'dataset' file\n", + "# Define dataset and the two candidate file names\n", + "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", + "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", + "\n", + "# Try loading dataelement reporting rates.\n", + "reporting_rate_month <- tryCatch({\n", + " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_de)\n", + " log_msg(glue(\"Reporting Rate data: `{file_name_de}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df_loaded), collapse=', ')}\"))\n", + " REPORTING_RATE_METHOD <- \"dataelement\"\n", + " df_loaded\n", + "}, \n", + " error = function(e) { \n", + " cat(glue(\"[ERROR] Error while loading Reporting Rate 'dataelement' version for: {COUNTRY_CODE} {conditionMessage(e)}\"))\n", + " return(NULL)\n", + "})\n", + "\n", + "# Try loading dataset reporting rates.\n", + "if (is.null(reporting_rate_month)) {\n", + " reporting_rate_month <- tryCatch({\n", + " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_ds) \n", + " log_msg(glue(\"Reporting Rate data: `{file_name_ds}` loaded from dataset: `{rr_dataset_name}`. 
Dataframe dimensions: {paste(dim(df_loaded), collapse=', ')}\"))\n", + " REPORTING_RATE_METHOD <- \"dataset\"\n", + " df_loaded\n", + " }, \n", + " error = function(e) { \n", + " stop(glue(\"[ERROR] Error while loading Reporting Rate 'dataset' version for: {COUNTRY_CODE} {conditionMessage(e)}\")) # raise error\n", + " })\n", + "}\n", + "\n", + "rm(df_loaded)\n", + "dim(reporting_rate_month)\n", + "head(reporting_rate_month, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "9d2529ad-8436-43c4-85b3-ed1ad9621e1e", + "metadata": {}, + "source": [ + "#### 🔍 Check on data completeness for `REPORTING_RATE` data\n", + "Normally we should have \"complete\" data (no missing or `NA` values). However, when using certain datasets (from pipeline: \"Reporting Rate (Dataset)\") we might have incomplete coverage and hence `NA`s ...
\n", + "These are \"problematic\" because **N2** (Incidence adj 2) will also have `NA` values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eae2a67f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check on data completeness for REPORTING RATE data: \n", + "# check how many values of REPORTING_RATE are NA\n", + "na_count <- sum(is.na(reporting_rate_month$REPORTING_RATE)) \n", + "if (na_count > 0) {\n", + " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", + "} else {\n", + " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9cfa7211-5595-4ed6-9699-0f35aebcbc09", + "metadata": {}, + "source": [ + "### 2.5. Load Care seeking data (file) \n", + "\n", + "Load if available" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94ede0e7-e0a8-4e06-ad6c-485869b6d4a9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if the file exist, and try loading it..\n", + "if (file.exists(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\"))) {\n", + " \n", + " care_seeking_data_f <- tryCatch({ read.csv(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading Care Seeking data (NER)\", conditionMessage(e)) \n", + " log_msg(msg)\n", + " stop(msg) \n", + " })\n", + "\n", + " log_msg(\"Care seeking data file loaded: 'uploads/care_seeking_ADM1.csv' (NER Specific).\")\n", + " \n", + " # ensure numeric\n", + " care_seeking_data_f$PCT <- as.numeric(care_seeking_data_f$PCT) \n", + " \n", + "} else {\n", + " care_seeking_data_f <- NULL\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46abe114-0f56-48df-bae7-44147602027c", + "metadata": 
{ + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "care_seeking_data_f" + ] + }, + { + "cell_type": "markdown", + "id": "06f0ebcc-6b87-4d77-98ef-7b8d84be6a0a", + "metadata": {}, + "source": [ + "-------------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "9943c1e5-4d95-4210-8b77-09c4085a96b8", + "metadata": {}, + "source": [ + "## 3. Calculate Incidence\n", + "First calculate monthly cases, then yearly incidence." + ] + }, + { + "cell_type": "markdown", + "id": "8769a974-de8a-4a1f-8f74-edb318a28060", + "metadata": {}, + "source": [ + "### 3.1 **Monthly cases**\n", + "\n", + "\n", + "These methods follow the standard WHO approach for estimating malaria incidence from routine health information systems (WHO, 2023).\n", + "As shown in the code, we begin by calculating **monthly malaria case metrics** (confirmed, tested, presumed) at the **ADM2** level and join them with the **monthly reporting rate**. \n", + "\n", + "This allows us to compute the **test positivity rate** (TPR, where `TPR` = `CONF` / `TEST`) and adjust for incomplete testing using the formula: \n", + "> **N1** = `CONF` + (`PRES` × `CONF` / `TEST`)\n", + "\n", + "Which is equivalent to:\n", + "> **N1** = `CONF` + (`PRES` × **TPR**)\n", + "\n", + "where:\n", + "- **N1** = cases adjusted for testing gaps \n", + "- `CONF` = **confirmed** cases\n", + "- `PRES` = **presumed** cases (either `SUSP` - `TEST` or directly available as `PRES`) 👈 this is a parameter (`N1_METHOD`)\n", + "- `TEST` = **tested** cases \n", + "- **TPR** = Test Positivity Rate (`CONF` / `TEST`)\n", + " \n", + "This produces `N1`, the number of cases adjusted for testing gaps, calculated at the monthly level in line with WHO recommendations to capture intra-annual variation.\n", + "\n", + "Next, we adjust for incomplete reporting using: \n", + "> **N2** = **N1** / `REPORTING_RATE`\n", + "\n", + "where `REPORTING_RATE` is at the monthly level, and is the ratio of received reports 
(submission to DHIS2) divided by the expected reports.\n", + "\n", + "Finally, _if_ **careseeking** data is **available**, N3 is calculated as follows:\n", + "> **N3** = N2 + (N2 * PRIVATE_CARE / PUBLIC_CARE) + (N2 * NO_CARE / PUBLIC_CARE)\n", + "\n", + "where:\n", + "- PRIVATE_CARE = proportion of kids treated in the **private** sector\n", + "- PUBLIC_CARE = proportion of kids treated in the **public** sector\n", + "- NO_CARE = proportion of kids who **did not receive any treatment**\n", + "\n", + "Note that this assumes the same TPR across all sectors (private and public).\n", + "\n", + "\n", + "\n", + "**Important note**
 \n", + "In case reporting rate equals zero (none of the health facilities reported in a given month), N2 is set to `NA`. Note that the annual N2 will be underestimated, which is preferable compared to having `Inf` values.\n", + "\n", + "-------------" + ] + }, + { + "cell_type": "markdown", + "id": "dcee32af-ae6d-4b2a-9c7a-f846209f1dc3", + "metadata": {}, + "source": [ + "This calculation expects (input):\n", + "* **routine_data**: DHIS2 routine data, formatted and aggregated at ADM2 and MONTH level. Tibble (df) _must_ contain the following cols: `YEAR`, `MONTH`, `ADM2`, `CONF`, `TEST`, `SUSP`, `PRES`. \n", + "* **reporting_rate_data**: reporting rate calculated at ADM2 and MONTH level and expressed as proportion **(0-1)**. Tibble (df) _must_ contain the following cols: `ADM2_ID`, `YEAR`, `MONTH`, `REPORTING_RATE`\n", + "\n", + "The calculation produces (output):\n", + "* data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (`CONF`, `TEST`, `SUSP`, `PRES`), `TPR`, `N1`, `N2`\n", + "\n", + "-----------------" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1a0899a-3308-4d90-b06e-8a0cd4b849e1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Ensure correct data type for numerical columns ---------------------------------------\n", + "routine_data <- dhis2_routine %>%\n", + " mutate(across(any_of(c(\"YEAR\", \"MONTH\", \"CONF\", \"TEST\", \"SUSP\", \"PRES\")), as.numeric))\n", + "\n", + "reporting_rate_data <- reporting_rate_month %>% # reporting_rate_data\n", + " mutate(across(c(YEAR, MONTH, REPORTING_RATE), as.numeric))" + ] + }, + { + "cell_type": "markdown", + "id": "736dec8f", + "metadata": {}, + "source": [ + "#### 3.1.0. 
Aggregate at `ADM2` x `MONTH` & calculate **TPR**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8899964a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check for TEST > SUSP\n", + "routine_data |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58bf219e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Group & compute TPR\n", + "monthly_cases <- routine_data %>%\n", + " group_by(ADM1_ID, ADM2_ID, YEAR, MONTH) %>% # ADM1 needed to join careseeking data\n", + " summarise(\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " across(any_of(\"PRES\"), ~sum(., na.rm = TRUE), .names = \"PRES\"), # <- handles missing 'PRES' column gracefully\n", + " .groups = \"drop\") %>%\n", + " # Cleaning TEST data for \"SUSP-TEST\" method\n", + " mutate(TEST = ifelse(N1_METHOD == \"SUSP-TEST\" & !is.na(SUSP) & (TEST > SUSP), SUSP, TEST)) %>%\n", + " left_join(reporting_rate_data,\n", + " by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>% \n", + " # Calculate TPR based on CONF and TEST\n", + " # Note: if TEST is 0 or NA, set TPR = 1 (to avoid division by zero which produces Inf)\n", + " mutate( \n", + " TPR = ifelse(!is.na(CONF) & !is.na(TEST) & (TEST != 0), CONF / TEST, 1)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938f0194", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check for TEST > SUSP\n", + "monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " + ] + }, + { + "cell_type": "markdown", + "id": "df43d6d8", + "metadata": {}, + "source": [ + "#### 3.1.1. 
Calculate **N1**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead591bb-3936-486d-bb9e-7b01d0805d0d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate N1 based on `N1_METHOD` & availability of `PRES` \n", + "\n", + "if (N1_METHOD == \"SUSP-TEST\") {\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", + " log_msg(\"Calculating N1 as `N1 = CONF + ((SUSP - TEST) * TPR)`\")\n", + "} else if (N1_METHOD == \"PRES\") {\n", + " # if: column named \"PRES\" exists in `monthly_cases` and contains at least one non-missing value\n", + " if (\"PRES\" %in% names(monthly_cases) && !all(is.na(monthly_cases$PRES))) {\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(N1 = CONF + (PRES * TPR))\n", + " log_msg(\"ℹ️ Calculating N1 as `N1 = CONF + (PRES * TPR)`\")\n", + " } else {\n", + " log_msg(\"🚨 Warning: 'PRES' not found in routine data or contains all `NA` values! 🚨 Calculating N1 using 'SUSP-TEST' method instead.\")\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", + " }\n", + "} else {\n", + " log_msg(\"Invalid N1_METHOD. Please use 'PRES' or 'SUSP-TEST'.\") # not really necessary ... \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9543d283", + "metadata": {}, + "source": [ + "#### 3.1.2. 
Calculate **N2**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5060017a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate N2\n", + "monthly_cases <- monthly_cases %>%\n", + " mutate(\n", + " N2 = ifelse(REPORTING_RATE == 0, NA_real_, N1 / REPORTING_RATE) # On the fly convert `RR == 0` to NA to avoid N2 == Inf\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "debb1745-5066-4126-8a15-853b21ee8776", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Log msg about zero REPORTING RATE cases and warn that N2 set to NA\n", + "\n", + "zero_reporting <- reporting_rate_data %>%\n", + " filter(REPORTING_RATE == 0) %>%\n", + " summarise(\n", + " n_months_zero_reporting = n(),\n", + " affected_zones = n_distinct(ADM2_ID)\n", + " )\n", + "\n", + "if (zero_reporting$n_months_zero_reporting > 0) { \n", + " log_msg(glue(\"🚨 Note: {zero_reporting$n_months_zero_reporting} rows had `REPORTING_RATE == 0` across \",\n", + " \"{zero_reporting$affected_zones} ADM2. These N2 values were set to NA.\"))\n", + "} else {\n", + " log_msg(\"✅ Note: no ADM2 has `REPORTING_RATE == 0`. All N2 values were preserved.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "053f854f-ee8c-40a3-abd6-82d69cc7dca4", + "metadata": {}, + "source": [ + "#### 3.1.3. 
(optional) Compute **N3** with adjusted **N2** by 'care seeking data file' (csv)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a306e23-93b7-4b8d-84ec-e4afe413a613", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!is.null(care_seeking_data_f)) {\n", + " monthly_cases <- monthly_cases %>%\n", + " left_join(., care_seeking_data_f %>% select(ADM1_ID, PCT), by = c(\"ADM1_ID\")) %>%\n", + " mutate(\n", + " N3 = N2 / PCT\n", + " ) %>% \n", + " select(-PCT)\n", + " log_msg(\"N2 adjusted by care seeking data (NER Specific).\")\n", + " # head(monthly_cases)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "dd9b5677", + "metadata": {}, + "source": [ + "#### 3.1.4. (optional) Calculate **N3**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7aa926ed-99ea-474c-988e-8151d6b12002", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Only calculate N3 if CARESEEKING data is avaiable \n", + "if (!is.null(careseeking_data)) {\n", + " monthly_cases <- monthly_cases %>%\n", + " mutate(YEAR = as.numeric(YEAR)) %>% # keep as safety\n", + " left_join(., careseeking_data, by = c(\"ADM1_ID\")) %>%\n", + " mutate(\n", + " N3 = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE) \n", + " )\n", + "} else {\n", + " print(\"🦘 Careseeking data not available, skipping calculation of N3.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67ddc0e-40ea-41d7-9fef-5e05d9594956", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(monthly_cases, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "fb4214ba", + "metadata": {}, + "source": [ + "#### 💾 Export `monthly_cases` (for 📓report notebook)\n", + "For coherence checks, which need monthly resolution ... !" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d80a7cb6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Save monthly_cases as .parquet file \n", + "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", + "arrow::write_parquet(monthly_cases, file_path)\n", + "\n", + "# Log msg\n", + "log_msg(glue(\"Monthly cases data saved to: {file_path}\"))\n", + "head(monthly_cases)" + ] + }, + { + "cell_type": "markdown", + "id": "7b50302e-20af-4fa6-8e8c-1e3a6c763ea2", + "metadata": {}, + "source": [ + "### 🔍 Data **coherence** checks on **monthly cases**\n", + "Check for ratios or differences that will cause negative values -> which will causes adjusted incidence to be lower than the values it adjust\n", + "\n", + "\n", + "Namely, the following relationships among INDICATORs:\n", + "* SUSP-TEST\n", + "* CONF/TEST\n", + "* N1 == CONF ... (when PRES == 0)" + ] + }, + { + "cell_type": "markdown", + "id": "e9f7ae73-46f6-4c78-9bb2-fdcfbd591b10", + "metadata": {}, + "source": [ + "#### 1. `PRES == 0`: causes `N1 == CONF` \n", + "(if `N1_METHOD == \"PRES\"`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495fe18a-50ad-4eff-8669-135c63a7c8dd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Run this check only if N1_METHOD == \"PRES\" (else, problem doesn't exist)\n", + "if (N1_METHOD == \"PRES\") {\n", + " nr_of_pres_0_adm2_month <- monthly_cases |> filter(PRES == 0) |> nrow()\n", + " log_msg(glue(\"🚨 Note: using `PRES` for incidence adjustement, but `PRES == 0` for {nr_of_pres_0_adm2_month} rows (ADM2 x MONTH).\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e12e744b-540e-462c-a16e-edbb05ddc047", + "metadata": {}, + "source": [ + "#### 2. 
`SUSP-TEST`: if negative, then N1 smaller or equal to CONF (ADJ =< CRUDE)\n", + "(if `N1_METHOD == \"SUSP-TEST\"`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49de98f1-2424-440e-922f-72d7702dd894", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# SUSP - TEST: if negative (TEST > SUSP), then N1 smaller or equal to CONF, which then causes ADJ ≤ CRUDE\n", + "if (N1_METHOD == \"SUSP-TEST\") {\n", + " nr_of_negative <- monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() \n", + " if (nr_of_negative > 0) {\n", + " log_msg(\n", + " glue(\"🚨 Note: using formula `SUSP - TEST` for incidence adjustement, but higher tested than suspected cases (`SUSP < TEST`) detected in {nr_of_negative} rows (ADM2 x MONTH).\"),\n", + " \"warning\"\n", + " )\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d72d7545-9afa-45c6-9efd-2619aecfc794", + "metadata": {}, + "source": [ + "#### 3. 
`CONF/TEST` = `TPR` (to calculate N1: Incidence adjusted for **Testing**)\n", + "This **ratio should** always be **≤ 1** because **there should _not_ be more confirmed cases than tested** ...\n", + "\n", + "(but if very small, then N1 could be smaller or equal to CONF (so ADJ INC ≤ CRUDE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc60295-5046-4932-b332-965fd320f72e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "more_confirmed_than_tested <- monthly_cases |> mutate(CONF_divby_TEST = CONF / TEST) |> filter(CONF_divby_TEST > 1) |> nrow() \n", + "\n", + "if (more_confirmed_than_tested > 0) {\n", + " log_msg(glue(\"🚨 Note: higher confirmed than tested cases (`CONF/TEST`) detected in {more_confirmed_than_tested} rows (ADM2 x MONTH).\"), \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "acbabb99-07ce-4054-a702-2d3cd59c328e", + "metadata": {}, + "source": [ + "### 3.2 **Yearly incidence**\n", + "After calculating N1 and N2 for each `ADM2`-`MONTH`, we aggregate the data annually to compute the yearly totals (sums) for crude cases (`CONF`), `N1` and `N2`. Finally, we compute:\n", + "* Crude incidence: C / POP × 1000\n", + "* Incidence adjusted for testing: N1 / POP × 1000\n", + "* Incidence adjusted for testing and reporting: N2 / POP × 1000\n", + "* Incidence adjusted for testing, reporting and careseeking behaviour (optional): N3 / POP × 1000\n", + "\n", + "--------------" + ] + }, + { + "cell_type": "markdown", + "id": "d47a3908-71cb-4e79-8771-f6caceae4ce2", + "metadata": {}, + "source": [ + "The calculation expects (input):\n", + "* **monthly_cases**: as the output of `calculate_monthly_cases()`, or a tibble/data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (CONF, TEST, SUSP, PRES), `TPR`, `N1`, `N2` \n", + "* **population_data**: df of population data formatted and aligned, aggregated at ADM2 and YEAR level. 
A tibble/data frame that _must_ contain the following cols: `ADM2`, `YEAR`, `POPULATION`\n", + "\n", + "The calculation produces (output): \n", + "* a data frame with the following cols: ADM2_ID, YEAR, CONF, N1, N2, `INCIDENCE_CRUDE`, `INCIDENCE_ADJ_TESTING`, `INCIDENCE_ADJ_REPORTING`\n", + "\n", + "--------------------" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8753721-067b-4da3-8305-8d98f823454f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 1. Enforce column types upfront ----\n", + "monthly_cases <- monthly_cases %>% \n", + " mutate(across(where(is.numeric), as.numeric)) # Convert all numeric columns\n", + " \n", + "population_data <- dhis2_population_adm2 %>% # population_data\n", + " mutate(across(c(YEAR, POPULATION), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a9ea81-4f7c-4505-8847-07de13831a42", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 2. Core calculation ----\n", + "yearly_incidence <- monthly_cases %>%\n", + " group_by(ADM2_ID, YEAR) %>%\n", + " summarise(\n", + " # 🚨 removed `na.rm = TRUE` on 20250702 - if things break check here! 
🚨 \n", + " across(c(CONF, N1, N2), ~sum(.)), #, na.rm = TRUE)), # 🔍 PROBLEM: if NA's in N2 (due to missing RR data), the sum of N2 by YEAR is smaller than the sum of N1 !\n", + " # across(any_of(c(\"CONF\", \"TEST\", \"SUSP\", \"PRES\", \"N1\", \"N2\")), ~sum(.)), # silenced as not necessary to also summarize \"TEST\", \"SUSP\", \"PRES\"\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " left_join(\n", + " population_data,\n", + " by = c(\"ADM2_ID\", \"YEAR\")\n", + " ) %>%\n", + " mutate(\n", + " INCIDENCE_CRUDE = CONF / POPULATION * 1000,\n", + " INCIDENCE_ADJ_TESTING = N1 / POPULATION * 1000,\n", + " INCIDENCE_ADJ_REPORTING = N2 / POPULATION * 1000\n", + " ) |>\n", + " ungroup()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c712f4c2-d677-4d22-8298-111aa0a93034", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 3.1 Optional careseeking data CSV adjustment ---- \n", + "if (!is.null(care_seeking_data_f) && \"N3\" %in% names(monthly_cases)) {\n", + " n3_data <- monthly_cases %>%\n", + " group_by(ADM2_ID, YEAR) %>%\n", + " summarise(N3 = sum(N3, na.rm = TRUE),\n", + " .groups = \"drop\") |>\n", + " ungroup()\n", + " \n", + " yearly_incidence <- yearly_incidence %>%\n", + " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", + " )\n", + " } else {\n", + " yearly_incidence <- yearly_incidence |>\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = NA\n", + " )\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7602a82b-8829-4613-8961-61c419073269", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ---- 3.2 Optional careseeking adjustment ----\n", + "if (is.null(care_seeking_data_f)) { # quick fix\n", + " \n", + " if (!is.null(careseeking_data) && \"N3\" %in% names(monthly_cases)) {\n", + " n3_data <- monthly_cases %>%\n", + " 
group_by(ADM2_ID, YEAR) %>%\n", + " summarise(N3 = sum(N3, na.rm = TRUE),\n", + " .groups = \"drop\") |>\n", + " ungroup()\n", + " \n", + " yearly_incidence <- yearly_incidence %>%\n", + " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", + " )\n", + " } else {\n", + " yearly_incidence <- yearly_incidence |>\n", + " mutate(\n", + " INCIDENCE_ADJ_CARESEEKING = NA\n", + " )\n", + " }\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56001b15-f74e-42d9-bfa2-bd5563b6a512", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(yearly_incidence, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "7976f894-daf4-46c3-9fa1-5303cbba0818", + "metadata": {}, + "source": [ + "### 🔍 Data **coherence** checks on **yearly incidence**\n", + "Here we check if values of Indicidence (already at `YEAR` resolution) make sense in relation to each other.
\n", + "Namely:\n", + "* crude values should be the lowest, and any consecutive **adjustment** should cause the incidence values to **increase** or remain the **same** - but should never be lower!" + ] + }, + { + "cell_type": "markdown", + "id": "d3dfac34-86f5-4f8c-add9-f54485259924", + "metadata": {}, + "source": [ + "#### 1. `INCIDENCE_ADJ_TESTING` (adj. level 1) should always be greater than `INCIDENCE_CRUDE` (not adjusted)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb03778-f1db-4f28-9b09-3cd8d815f976", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# same as below but different cols ... \n", + "# Count TRUE values, handling potential NAs in the result of if_else\n", + "nr_of_impossible_values <- yearly_incidence |>\n", + " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE)) |>\n", + " pull(IMPOSSIBLE_VALUE) |>\n", + " sum(na.rm = TRUE) \n", + "\n", + "# Warning if any impossible values are found\n", + "if (nr_of_impossible_values > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE!\"), \"warning\")\n", + "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_CRUDE` is smaller than `INCIDENCE_ADJ_TESTING` (as expected).\")\n", + "\n", + "# Check if all values in a column are NA\n", + "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_TESTING))) {\n", + " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_TESTING` are `NA`s\", \"warning\")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "827d1e84-7f43-404c-88cc-9b675bfa48a1", + "metadata": {}, + "source": [ + "#### 2. `INCIDENCE_ADJ_REPORTING` (adj. level 2) should always be greater than `INCIDENCE_ADJ_TESTING` (adj. 
level 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b1da976-b157-4e94-b5e9-8795e87bb416", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Count TRUE values, handling potential NAs in the result of if_else\n", + "nr_of_impossible_values <- yearly_incidence |>\n", + " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)) |>\n", + " pull(IMPOSSIBLE_VALUE) |>\n", + " sum(na.rm = TRUE) \n", + "\n", + "# Warning if any impossible values are found\n", + "if (nr_of_impossible_values > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING!\"), \"warning\")\n", + "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_ADJ_TESTING` is smaller than `INCIDENCE_ADJ_REPORTING` (as expected).\")\n", + "\n", + "# Check if all values in a column are NA\n", + "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_REPORTING))) {\n", + " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_REPORTING` are `NA`s\", \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3e57f2e8-1ccc-417c-9fa6-e6b1976336bc", + "metadata": {}, + "source": [ + "## 4. Export to `/data/dhis2_incidence/` folder" + ] + }, + { + "cell_type": "markdown", + "id": "5b6861bb", + "metadata": {}, + "source": [ + "### 4.0. 
Keep only essential cols \n", + "Based on [SNT Pipelines Data glossary](https://docs.google.com/spreadsheets/d/1qvZMsmCWU6cVLgGZTEXsd5xmoecIxb4LAd-g_2qzYdw/edit?usp=sharing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4085515", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "yearly_incidence <- yearly_incidence |>\n", + "select(\n", + " YEAR, \n", + " starts_with(\"ADM\"),\n", + " starts_with(\"POPULATION\"),\n", + " starts_with(\"INCIDENCE\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e432998-bf85-4706-bea4-8684b0b58c16", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Reusable function to generate filename and save data\n", + "save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function) {\n", + " \n", + " base_name_parts <- c(COUNTRY_CODE, \"_incidence\")\n", + " \n", + " # Concatenate all parts to form the final filename\n", + " file_name <- paste0(c(base_name_parts, file_extension), collapse = \"\")\n", + " # file_path <- file.path(data_path, \"incidence\", file_name)\n", + " file_path <- file.path(data_path, file_name)\n", + " output_dir <- dirname(file_path)\n", + "\n", + " # Check if the output directory exists, else create it\n", + " if (!dir.exists(output_dir)) {\n", + " dir.create(output_dir, recursive = TRUE)\n", + " }\n", + "\n", + " # Flexibility to use function as provided in argument: \"write_csv\" or \"arrow::write_parquet\" ... 
\n", + " write_function(yearly_incidence, file_path)\n", + "\n", + " log_msg(paste0(\"Exporting : \", file_path))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3ed49106-b335-42c3-9511-ffd864dd50f0", + "metadata": {}, + "source": [ + "#### 👥 Population Disaggregation logic \n", + "\n", + "Provide a msg to the user to indicate that the results correspond to a specific version of indicators and population (under5, pregnant or totals)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f15c0c4-74d9-4280-ba46-8add435a9147", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# if (COUNTRY_CODE == \"NER\" & INDICATORS_FOUND) {\n", + "if (INDICATORS_FOUND) {\n", + " log_msg(glue(\"ℹ️ The results have been computed using the following Indicators: {paste(target_colnames, collapse=', ')}\"))\n", + " log_msg(glue(\"ℹ️ The results have been computed using the following Population: {POPULATION_SELECTION}\"))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16e7d83b-3962-4041-9d2d-aaa362b62d5f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Export the data\n", + "\n", + "# CSV\n", + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv)\n", + "\n", + "# Parquet\n", + "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet)" + ] } - }, - "outputs": [], - "source": [ - "# function**\n", - "# Define dataset and file name (based on paramter)\n", - "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - "\n", - "# Try loading dataelement reporting rates.\n", - "reporting_rate_month <- tryCatch({\n", - " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_de)\n", - " 
log_msg(glue(\"Reporting Rate data: `{file_name_de}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df), collapse=', ')}\"))\n", - " REPORTING_RATE_METHOD <- \"dataelement\"\n", - " df_loaded\n", - "}, \n", - " error = function(e) { \n", - " cat(glue(\"[ERROR] Error while loading Reporting Rate 'dataelement' version for: {COUNTRY_CODE} {conditionMessage(e)}\"))\n", - " return(NULL)\n", - "})\n", - "\n", - "# Try loading dataset reporting rates.\n", - "if (is.null(reporting_rate_month)) {\n", - " reporting_rate_month <- tryCatch({\n", - " df_loaded <- get_latest_dataset_file_in_memory(rr_dataset_name, file_name_ds) \n", - " log_msg(glue(\"Reporting Rate data: `{file_name_ds}` loaded from dataset: `{rr_dataset_name}`. Dataframe dimensions: {paste(dim(df), collapse=', ')}\"))\n", - " REPORTING_RATE_METHOD <- \"dataset\"\n", - " df_loaded\n", - " }, \n", - " error = function(e) { \n", - " stop(glue(\"[ERROR] Error while loading Reporting Rate 'dataset' version for: {COUNTRY_CODE} {conditionMessage(e)}\")) # raise error\n", - " })\n", - "}\n", - "\n", - "rm(df_loaded)\n", - "dim(reporting_rate_month)\n", - "head(reporting_rate_month, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "9d2529ad-8436-43c4-85b3-ed1ad9621e1e", - "metadata": {}, - "source": [ - "#### 🔍 Checkon data completeness for `REPORTING_RATE` data\n", - "Normally we should have \"complete\" data (no missing or `NA` values). However, when using certain datasets (from pipeline: \"Reporting Rate (Dataset)\") we might have incomplete coverage and hence `NA`s ...
\n", - "These are \"problematic\" because **N2** (Incidence adj 2) will also have `NA` values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eae2a67f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check on data completeness for REPORTING RATE data: \n", - "# check how many values of REPORTING_RATE are NA\n", - "na_count <- sum(is.na(reporting_rate_month$REPORTING_RATE)) \n", - "if (na_count > 0) {\n", - " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9cfa7211-5595-4ed6-9699-0f35aebcbc09", - "metadata": {}, - "source": [ - "### 2.5. Load Care seeking data (file) \n", - "\n", - "Load if available" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94ede0e7-e0a8-4e06-ad6c-485869b6d4a9", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if the file exist, and try loading it..\n", - "if (file.exists(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\"))) {\n", - " \n", - " care_seeking_data_f <- tryCatch({ read.csv(file.path(SNT_ROOT_PATH, \"uploads/care_seeking_ADM1.csv\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading Care Seeking data (NER)\", conditionMessage(e)) \n", - " log_msg(msg)\n", - " stop(msg) \n", - " })\n", - "\n", - " log_msg(\"Care seeking data file loaded: 'uploads/care_seeking_ADM1.csv' (NER Specific).\")\n", - " \n", - " # ensure numeric\n", - " care_seeking_data_f$PCT <- as.numeric(care_seeking_data_f$PCT) \n", - " \n", - "} else {\n", - " care_seeking_data_f <- NULL\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46abe114-0f56-48df-bae7-44147602027c", - "metadata": 
{ - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "care_seeking_data_f" - ] - }, - { - "cell_type": "markdown", - "id": "06f0ebcc-6b87-4d77-98ef-7b8d84be6a0a", - "metadata": {}, - "source": [ - "-------------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "9943c1e5-4d95-4210-8b77-09c4085a96b8", - "metadata": {}, - "source": [ - "## 3. Calculate Incidence\n", - "First calculate monthly cases, then yearly incidence." - ] - }, - { - "cell_type": "markdown", - "id": "8769a974-de8a-4a1f-8f74-edb318a28060", - "metadata": {}, - "source": [ - "### 3.1 **Monthly cases**\n", - "\n", - "\n", - "These methods follow the standard WHO approach for estimating malaria incidence from routine health information systems (WHO, 2023).\n", - "As shown in the code, we begin by calculating **monthly malaria case metrics** (confirmed, tested, presumed) at the **ADM2** level and join them with the **monthly reporting rate**. \n", - "\n", - "This allows us to compute the **test positivity rate** (TPR, where `TPR` = `CONF` / `TEST`) and adjust for incomplete testing using the formula: \n", - "> **N1** = `CONF` + (`PRES` × `CONF` / `TEST`)\n", - "\n", - "Which is equivalent to:\n", - "> **N1** = `CONF` + (`PRES` × **TPR**)\n", - "\n", - "where:\n", - "- **N1** = cases adjusted for testing gaps \n", - "- `CONF` = **confirmed** cases\n", - "- `PRES` = **presumed** cases (either `SUSP` - `TEST` or directly available as `PRES`) 👈 this is a parameter (`N1_METHOD`)\n", - "- `TEST` = **tested** cases \n", - "- **TPR** = Test Positivity Rate (`CONF` / `TEST`)\n", - " \n", - "This produces `N1`, the number of cases adjusted for testing gaps, calculated at the monthly level in line with WHO recommendations to capture intra-annual variation.\n", - "\n", - "Next, we adjust for incomplete reporting using: \n", - "> **N2** = **N1** / `REPORTING_RATE`\n", - "\n", - "where `REPORTING_RATE` is at the monthly levele, and is the ratio of received reports 
(submission to DHIS2) divided by the expected reports.\n", - "\n", - "Finally, _if_ **careseeking** data is **available**, N3 is calculated as follows:\n", - "> **N3** = N2 + (N2 * PROP_PRIV / PROP_PUBL) + (N2 * NO_TREAT / PROP_PUBL)\n", - "\n", - "where:\n", - "- PRIVATE_CARE = proportion of kids treated in the **private** sector\n", - "- PUBLIC_CARE = proportion of kids treated in the **public** sector\n", - "- NO_CARE = proportion of kids which **did not receive any treatment**\n", - "\n", - "Note that this assumes the same TPR across all sectors (private and public).\n", - "\n", - "\n", - "\n", - "**Important note**
\n", - "In case reporting rate equals zero (none of the health facilities reported in a given month), N2 is set to `NA`. Note that the annual N2 will be underestimated, which is preferable compared to having `Inf` values.\n", - "\n", - "-------------" - ] - }, - { - "cell_type": "markdown", - "id": "dcee32af-ae6d-4b2a-9c7a-f846209f1dc3", - "metadata": {}, - "source": [ - "This calculation expects (input):\n", - "* **routine_data**: DHIS2 routine data, formatted and aggregated at ADM2 and MONTH level. Tibble (df) _must_ contain the following cols: `YEAR`, `MONTH`, `ADM2`, `CONF`, `TEST`, `SUSP`, `PRES`. \n", - "* **reporting_rate_data**: reporting rate calculated at ADM2 and MONTH level and expressed as proprtion **(0-1)**. Tibble (df) _must_ contain the following cols: `ADM2`, `YEAR`, `MONTH`, `reporting_rate`\n", - "\n", - "The calculation produces (output):\n", - "* data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (`CONF`, `TEST`, `SUSP`, `PRES`), `TPR`, `N1`, `N2`\n", - "\n", - "-----------------" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1a0899a-3308-4d90-b06e-8a0cd4b849e1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Ensure correct data type for numerical columns ---------------------------------------\n", - "routine_data <- dhis2_routine %>%\n", - " mutate(across(any_of(c(\"YEAR\", \"MONTH\", \"CONF\", \"TEST\", \"SUSP\", \"PRES\")), as.numeric))\n", - "\n", - "reporting_rate_data <- reporting_rate_month %>% # reporting_rate_data\n", - " mutate(across(c(YEAR, MONTH, REPORTING_RATE), as.numeric))" - ] - }, - { - "cell_type": "markdown", - "id": "736dec8f", - "metadata": {}, - "source": [ - "#### 3.1.0. 
Aggregate at `ADM2` x `MONTH` & calculate **TPR**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8899964a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check for TEST > SUSP\n", - "routine_data |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58bf219e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Group & compute TPR\n", - "monthly_cases <- routine_data %>%\n", - " group_by(ADM1_ID, ADM2_ID, YEAR, MONTH) %>% # ADM1 needed to join careseeking data\n", - " summarise(\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " across(any_of(\"PRES\"), ~sum(., na.rm = TRUE), .names = \"PRES\"), # <- handles missing 'PRES' column gracefully\n", - " .groups = \"drop\") %>%\n", - " # Cleaning TEST data for \"SUSP-TEST\" method\n", - " mutate(TEST = ifelse(N1_METHOD == \"SUSP-TEST\" & !is.na(SUSP) & (TEST > SUSP), SUSP, TEST)) %>%\n", - " left_join(reporting_rate_data,\n", - " by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>% \n", - " # Calculate TPR based on CONF and TEST\n", - " # Note: if TEST is 0 or NA, set TPR = 1 (to avoid division by zero which produces Inf)\n", - " mutate( \n", - " TPR = ifelse(!is.na(CONF) & !is.na(TEST) & (TEST != 0), CONF / TEST, 1)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "938f0194", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check for TEST > SUSP\n", - "monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() " - ] - }, - { - "cell_type": "markdown", - "id": "df43d6d8", - "metadata": {}, - "source": [ - "#### 3.1.1. 
Calculate **N1**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ead591bb-3936-486d-bb9e-7b01d0805d0d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate N1 based on `N1_METHOD` & availability of `PRES` \n", - "\n", - "if (N1_METHOD == \"SUSP-TEST\") {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", - " log_msg(\"Calculating N1 as `N1 = CONF + ((SUSP - TEST) * TPR)`\")\n", - "} else if (N1_METHOD == \"PRES\") {\n", - " # if: column named \"PRES\" exists in `monthly_cases` and contains at least one non-missing value\n", - " if (\"PRES\" %in% names(monthly_cases) && !all(is.na(monthly_cases$PRES))) {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + (PRES * TPR))\n", - " log_msg(\"ℹ️ Calculating N1 as `N1 = CONF + (PRES * TPR)`\")\n", - " } else {\n", - " log_msg(\"🚨 Warning: 'PRES' not found in routine data or contains all `NA` values! 🚨 Calculating N1 using 'SUSP-TEST' method instead.\")\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(N1 = CONF + ((SUSP - TEST) * TPR))\n", - " }\n", - "} else {\n", - " log_msg(\"Invalid N1_METHOD. Please use 'PRES' or 'SUSP-TEST'.\") # not really necessary ... \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9543d283", - "metadata": {}, - "source": [ - "#### 3.1.2. 
Calculate **N2**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5060017a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate N2\n", - "monthly_cases <- monthly_cases %>%\n", - " mutate(\n", - " N2 = ifelse(REPORTING_RATE == 0, NA_real_, N1 / REPORTING_RATE) # On the fly convert `RR == 0` to NA to avoid N2 == Inf\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "debb1745-5066-4126-8a15-853b21ee8776", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Log msg about zero REPORTING RATE cases and warn that N2 set to NA\n", - "\n", - "zero_reporting <- reporting_rate_data %>%\n", - " filter(REPORTING_RATE == 0) %>%\n", - " summarise(\n", - " n_months_zero_reporting = n(),\n", - " affected_zones = n_distinct(ADM2_ID)\n", - " )\n", - "\n", - "if (zero_reporting$n_months_zero_reporting > 0) { \n", - " log_msg(glue(\"🚨 Note: {zero_reporting$n_months_zero_reporting} rows had `REPORTING_RATE == 0` across \",\n", - " \"{zero_reporting$affected_zones} ADM2. These N2 values were set to NA.\"))\n", - "} else {\n", - " log_msg(\"✅ Note: no ADM2 has `REPORTING_RATE == 0`. All N2 values were preserved.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "053f854f-ee8c-40a3-abd6-82d69cc7dca4", - "metadata": {}, - "source": [ - "#### 3.1.3. 
(optional) Compute **N3** with adjusted **N2** by 'care seeking data file' (csv)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a306e23-93b7-4b8d-84ec-e4afe413a613", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!is.null(care_seeking_data_f)) {\n", - " monthly_cases <- monthly_cases %>%\n", - " left_join(., care_seeking_data_f %>% select(ADM1_ID, PCT), by = c(\"ADM1_ID\")) %>%\n", - " mutate(\n", - " N3 = N2 / PCT\n", - " ) %>% \n", - " select(-PCT)\n", - " log_msg(\"N2 adjusted by care seeking data (NER Specific).\")\n", - " # head(monthly_cases)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "dd9b5677", - "metadata": {}, - "source": [ - "#### 3.1.4. (optional) Calculate **N3**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7aa926ed-99ea-474c-988e-8151d6b12002", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Only calculate N3 if CARESEEKING data is avaiable \n", - "if (!is.null(careseeking_data)) {\n", - " monthly_cases <- monthly_cases %>%\n", - " mutate(YEAR = as.numeric(YEAR)) %>% # keep as safety\n", - " left_join(., careseeking_data, by = c(\"ADM1_ID\")) %>%\n", - " mutate(\n", - " N3 = N2 + (N2 * PCT_PRIVATE_CARE / PCT_PUBLIC_CARE) + (N2 * PCT_NO_CARE / PCT_PUBLIC_CARE) \n", - " )\n", - "} else {\n", - " print(\"🦘 Careseeking data not available, skipping calculation of N3.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a67ddc0e-40ea-41d7-9fef-5e05d9594956", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(monthly_cases, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "fb4214ba", - "metadata": {}, - "source": [ - "#### 💾 Export `monthly_cases` (for 📓report notebook)\n", - "For coherence checks, which need monthly resolution ... !" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d80a7cb6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Save monthly_cases as .parquet file \n", - "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", - "arrow::write_parquet(monthly_cases, file_path)\n", - "\n", - "# Log msg\n", - "log_msg(glue(\"Monthly cases data saved to: {file_path}\"))\n", - "head(monthly_cases)" - ] - }, - { - "cell_type": "markdown", - "id": "7b50302e-20af-4fa6-8e8c-1e3a6c763ea2", - "metadata": {}, - "source": [ - "### 🔍 Data **coherence** checks on **monthly cases**\n", - "Check for ratios or differences that will cause negative values -> which will causes adjusted incidence to be lower than the values it adjust\n", - "\n", - "\n", - "Namely, the following relationships among INDICATORs:\n", - "* SUSP-TEST\n", - "* CONF/TEST\n", - "* N1 == CONF ... (when PRES == 0)" - ] - }, - { - "cell_type": "markdown", - "id": "e9f7ae73-46f6-4c78-9bb2-fdcfbd591b10", - "metadata": {}, - "source": [ - "#### 1. `PRES == 0`: causes `N1 == CONF` \n", - "(if `N1_METHOD == \"PRES\"`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "495fe18a-50ad-4eff-8669-135c63a7c8dd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Run this check only if N1_METHOD == \"PRES\" (else, problem doesn't exist)\n", - "if (N1_METHOD == \"PRES\") {\n", - " nr_of_pres_0_adm2_month <- monthly_cases |> filter(PRES == 0) |> nrow()\n", - " log_msg(glue(\"🚨 Note: using `PRES` for incidence adjustement, but `PRES == 0` for {nr_of_pres_0_adm2_month} rows (ADM2 x MONTH).\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e12e744b-540e-462c-a16e-edbb05ddc047", - "metadata": {}, - "source": [ - "#### 2. 
`SUSP-TEST`: if negative, then N1 smaller or equal to CONF (ADJ =< CRUDE)\n", - "(if `N1_METHOD == \"SUSP-TEST\"`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49de98f1-2424-440e-922f-72d7702dd894", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# SUSP - TEST: if negative (TEST > SUSP), then N1 smaller or equal to CONF, which then causes ADJ ≤ CRUDE\n", - "if (N1_METHOD == \"SUSP-TEST\") {\n", - " nr_of_negative <- monthly_cases |> mutate(SUSP_minus_TEST = SUSP - TEST) |> filter(SUSP_minus_TEST < 0) |> nrow() \n", - " if (nr_of_negative > 0) {\n", - " log_msg(\n", - " glue(\"🚨 Note: using formula `SUSP - TEST` for incidence adjustement, but higher tested than suspected cases (`SUSP < TEST`) detected in {nr_of_negative} rows (ADM2 x MONTH).\"),\n", - " \"warning\"\n", - " )\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "d72d7545-9afa-45c6-9efd-2619aecfc794", - "metadata": {}, - "source": [ - "#### 3. 
`CONF/TEST` = `TPR` (to calculate N1: Incidence adjusted for **Testing**)\n", - "This **ratio should** always be **≤ 1** because **there should _not_ be more confirmed cases than tested** ...\n", - "\n", - "(but if very small, then N1 could be smaller or equal to CONF (so ADJ INC ≤ CRUDE))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9cc60295-5046-4932-b332-965fd320f72e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "more_confirmed_than_tested <- monthly_cases |> mutate(CONF_divby_TEST = CONF / TEST) |> filter(CONF_divby_TEST > 1) |> nrow() \n", - "\n", - "if (more_confirmed_than_tested > 0) {\n", - " log_msg(glue(\"🚨 Note: higher confirmed than tested cases (`CONF/TEST`) detected in {more_confirmed_than_tested} rows (ADM2 x MONTH).\"), \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "acbabb99-07ce-4054-a702-2d3cd59c328e", - "metadata": {}, - "source": [ - "### 3.2 **Yearly incidence**\n", - "After calculating N1 and N2 for each `ADM2`-`MONTH`, we aggregate the data annually to compute the yearly totals (sums) for crude cases (`CONF`), `N1` and `N2`. Finally, we compute:\n", - "* Crude incidence: C / POP × 1000\n", - "* Incidence adjusted for testing: N1 / POP × 1000\n", - "* Incidence adjusted for testing and reporting: N2 / POP × 1000\n", - "* Incidence adjusted for testing, reporting and careseeking behaviour (optional): N3 / POP × 1000\n", - "\n", - "--------------" - ] - }, - { - "cell_type": "markdown", - "id": "d47a3908-71cb-4e79-8771-f6caceae4ce2", - "metadata": {}, - "source": [ - "The calculation expects (input):\n", - "* **monthly_cases**: as the output of `calculate_monthly_cases()`, or a tibble/data frame with the following cols: `ADM2`, `YEAR`, `MONTH`, \"value_\" * (CONF, TEST, SUSP, PRES), `TPR`, `N1`, `N2` \n", - "* **population_data**: df of population data formatted and aligned, aggregated at ADM2 and YEAR level. 
A tibble/data frame that _must_ contain the following cols: `ADM2`, `YEAR`, `POPULATION`\n", - "\n", - "The calculation produces (output): \n", - "* a data frame with the following cols: ADM2_ID, YEAR, CONF, N1, N2, `INCIDENCE_CRUDE`, `INCIDENCE_ADJ_TESTING`, `INCIDENCE_ADJ_REPORTING`\n", - "\n", - "--------------------" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8753721-067b-4da3-8305-8d98f823454f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 1. Enforce column types upfront ----\n", - "monthly_cases <- monthly_cases %>% \n", - " mutate(across(where(is.numeric), as.numeric)) # Convert all numeric columns\n", - " \n", - "population_data <- dhis2_population_adm2 %>% # population_data\n", - " mutate(across(c(YEAR, POPULATION), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4a9ea81-4f7c-4505-8847-07de13831a42", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 2. Core calculation ----\n", - "yearly_incidence <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(\n", - " # 🚨 removed `na.rm = TRUE` on 20250702 - if things break check here! 
🚨 \n", - " across(c(CONF, N1, N2), ~sum(.)), #, na.rm = TRUE)), # 🔍 PROBLEM: if NA's in N2 (due to missing RR data), the sum of N2 by YEAR is smaller than the sum of N1 !\n", - " # across(any_of(c(\"CONF\", \"TEST\", \"SUSP\", \"PRES\", \"N1\", \"N2\")), ~sum(.)), # silenced as not necessary to also summarize \"TEST\", \"SUSP\", \"PRES\"\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " left_join(\n", - " population_data,\n", - " by = c(\"ADM2_ID\", \"YEAR\")\n", - " ) %>%\n", - " mutate(\n", - " INCIDENCE_CRUDE = CONF / POPULATION * 1000,\n", - " INCIDENCE_ADJ_TESTING = N1 / POPULATION * 1000,\n", - " INCIDENCE_ADJ_REPORTING = N2 / POPULATION * 1000\n", - " ) |>\n", - " ungroup()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c712f4c2-d677-4d22-8298-111aa0a93034", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 3.1 Optional careseeking data CSV adjustment ---- \n", - "if (!is.null(care_seeking_data_f) && \"N3\" %in% names(monthly_cases)) {\n", - " n3_data <- monthly_cases %>%\n", - " group_by(ADM2_ID, YEAR) %>%\n", - " summarise(N3 = sum(N3, na.rm = TRUE),\n", - " .groups = \"drop\") |>\n", - " ungroup()\n", - " \n", - " yearly_incidence <- yearly_incidence %>%\n", - " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", - " )\n", - " } else {\n", - " yearly_incidence <- yearly_incidence |>\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = NA\n", - " )\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7602a82b-8829-4613-8961-61c419073269", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ---- 3.2 Optional careseeking adjustment ----\n", - "if (is.null(care_seeking_data_f)) { # quick fix\n", - " \n", - " if (!is.null(careseeking_data) && \"N3\" %in% names(monthly_cases)) {\n", - " n3_data <- monthly_cases %>%\n", - " 
group_by(ADM2_ID, YEAR) %>%\n", - " summarise(N3 = sum(N3, na.rm = TRUE),\n", - " .groups = \"drop\") |>\n", - " ungroup()\n", - " \n", - " yearly_incidence <- yearly_incidence %>%\n", - " left_join(n3_data, by = c(\"ADM2_ID\", \"YEAR\")) %>%\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = N3 / POPULATION * 1000\n", - " )\n", - " } else {\n", - " yearly_incidence <- yearly_incidence |>\n", - " mutate(\n", - " INCIDENCE_ADJ_CARESEEKING = NA\n", - " )\n", - " }\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56001b15-f74e-42d9-bfa2-bd5563b6a512", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(yearly_incidence, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "7976f894-daf4-46c3-9fa1-5303cbba0818", - "metadata": {}, - "source": [ - "### 🔍 Data **coherence** checks on **yearly incidence**\n", - "Here we check if values of Indicidence (already at `YEAR` resolution) make sense in relation to each other.
\n", - "Namely:\n", - "* crude values should be the lowest, and any consecutive **adjustment** should cause the incidence values to **increase** or remain the **same** - but should never be lower!" - ] - }, - { - "cell_type": "markdown", - "id": "d3dfac34-86f5-4f8c-add9-f54485259924", - "metadata": {}, - "source": [ - "#### 1. `INCIDENCE_ADJ_TESTING` (adj. level 1) should always be greater than `INCIDENCE_CRUDE` (not adjusted)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acb03778-f1db-4f28-9b09-3cd8d815f976", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# same as below but different cols ... \n", - "# Count TRUE values, handling potential NAs in the result of if_else\n", - "nr_of_impossible_values <- yearly_incidence |>\n", - " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE)) |>\n", - " pull(IMPOSSIBLE_VALUE) |>\n", - " sum(na.rm = TRUE) \n", - "\n", - "# Warning if any impossible values are found\n", - "if (nr_of_impossible_values > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE!\"), \"warning\")\n", - "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_CRUDE` is smaller than `INCIDENCE_ADJ_TESTING` (as expected).\")\n", - "\n", - "# Check if all values in a column are NA\n", - "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_TESTING))) {\n", - " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_TESTING` are `NA`s\", \"warning\")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "827d1e84-7f43-404c-88cc-9b675bfa48a1", - "metadata": {}, - "source": [ - "#### 2. `INCIDENCE_ADJ_REPORTING` (adj. level 2) should always be greater than `INCIDENCE_ADJ_TESTING` (adj. 
level 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b1da976-b157-4e94-b5e9-8795e87bb416", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count TRUE values, handling potential NAs in the result of if_else\n", - "nr_of_impossible_values <- yearly_incidence |>\n", - " mutate(IMPOSSIBLE_VALUE = if_else(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)) |>\n", - " pull(IMPOSSIBLE_VALUE) |>\n", - " sum(na.rm = TRUE) \n", - "\n", - "# Warning if any impossible values are found\n", - "if (nr_of_impossible_values > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: found {nr_of_impossible_values} rows where INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING!\"), \"warning\")\n", - "} else log_msg(\"✅ For all YEAR and ADM2, `INCIDENCE_ADJ_TESTING` is smaller than `INCIDENCE_ADJ_REPORTING` (as expected).\")\n", - "\n", - "# Check if all values in a column are NA\n", - "if (all(is.na(yearly_incidence$INCIDENCE_ADJ_REPORTING))) {\n", - " log_msg(\"🚨 Warning: all values of `INCIDENCE_ADJ_REPORTING` are `NA`s\", \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3e57f2e8-1ccc-417c-9fa6-e6b1976336bc", - "metadata": {}, - "source": [ - "## 4. Export to `/data/dhis2_incidence/` folder" - ] - }, - { - "cell_type": "markdown", - "id": "5b6861bb", - "metadata": {}, - "source": [ - "### 4.0. 
Keep only essential cols \n", - "Based on [SNT Pipelines Data glossary](https://docs.google.com/spreadsheets/d/1qvZMsmCWU6cVLgGZTEXsd5xmoecIxb4LAd-g_2qzYdw/edit?usp=sharing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4085515", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "yearly_incidence <- yearly_incidence |>\n", - "select(\n", - " YEAR, \n", - " starts_with(\"ADM\"),\n", - " starts_with(\"POPULATION\"),\n", - " starts_with(\"INCIDENCE\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e432998-bf85-4706-bea4-8684b0b58c16", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Reusable function to generate filename and save data\n", - "save_yearly_incidence <- function(yearly_incidence, data_path, file_extension, write_function) {\n", - " \n", - " base_name_parts <- c(COUNTRY_CODE, \"_incidence\")\n", - " \n", - " # Concatenate all parts to form the final filename\n", - " file_name <- paste0(c(base_name_parts, file_extension), collapse = \"\")\n", - " # file_path <- file.path(data_path, \"incidence\", file_name)\n", - " file_path <- file.path(data_path, file_name)\n", - " output_dir <- dirname(file_path)\n", - "\n", - " # Check if the output directory exists, else create it\n", - " if (!dir.exists(output_dir)) {\n", - " dir.create(output_dir, recursive = TRUE)\n", - " }\n", - "\n", - " # Flexibility to use function as provided in argument: \"write_csv\" or \"arrow::write_parquet\" ... 
\n", - " write_function(yearly_incidence, file_path)\n", - "\n", - " log_msg(paste0(\"Exporting : \", file_path))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3ed49106-b335-42c3-9511-ffd864dd50f0", - "metadata": {}, - "source": [ - "#### 👥 Population Disaggregation logic \n", - "\n", - "Provide a msg to the user to indicate that the results correspond to a specific version of indicators and population (under5, pregnant or totals)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f15c0c4-74d9-4280-ba46-8add435a9147", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# if (COUNTRY_CODE == \"NER\" & INDICATORS_FOUND) {\n", - "if (INDICATORS_FOUND) {\n", - " log_msg(glue(\"ℹ️ The results have been computed using the following Indicators: {paste(target_colnames, collapse=', ')}\"))\n", - " log_msg(glue(\"ℹ️ The results have been computed using the following Population: {POPULATION_SELECTION}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16e7d83b-3962-4041-9d2d-aaa362b62d5f", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# Export the data\n", - "\n", - "# CSV\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".csv\", write_csv)\n", - "\n", - "# Parquet\n", - "save_yearly_incidence(yearly_incidence, DATA_PATH, \".parquet\", arrow::write_parquet)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - 
"nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index 5c089b7..e9885f8 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -1,1201 +1,1205 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "30bf8dfc", - "metadata": {}, - "source": [ - "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", - "\n", - "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", - "
\n", - "\n", - "**Dataset Selection**
\n", - "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", - "\n", - "**Calculation Logic**
\n", - "From the selected dataset(s):\n", - "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", - "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", - "\n", - "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", - "
\n", - "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "
\n", - "and expressed as a **proportion** between 0 and 1.\n", - "
\n", - "\n", - "-----\n", - "\n", - "### Additional Data Processing Steps\n", - "\n", - "- **Handling Multiple Datasets:** \n", - " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", - "\n", - "- **Deduplication of Entries:** \n", - " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", - "
    \n", - "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", - "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", - "
\n", - "\n", - "-----\n", - "\n", - "\n", - "### 🇳🇪 Niger-Specific Processing: \n", - " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", - "
\n", - " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", - "\n", - "------\n", - "\n", - "### Pipeline parameters\n", - "\n", - "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", - " \n", - "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000092, - "end_time": "2025-12-19T10:21:50.273573", - "exception": false, - "start_time": "2025-12-19T10:21:50.273481", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. 
Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:21:50.332786Z", - "iopub.status.busy": "2025-12-19T10:21:50.277536Z", - "iopub.status.idle": "2025-12-19T10:23:03.339080Z", - "shell.execute_reply": "2025-12-19T10:23:03.336413Z" - }, - "papermill": { - "duration": 73.068006, - "end_time": "2025-12-19T10:23:03.341764", - "exception": false, - "start_time": "2025-12-19T10:21:50.273758", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 0.00017, - "end_time": "2025-12-19T10:23:03.342235", - "exception": false, - "start_time": "2025-12-19T10:23:03.342065", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.1. 
Load and check `config_json` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.351367Z", - "iopub.status.busy": "2025-12-19T10:23:03.348819Z", - "iopub.status.idle": "2025-12-19T10:23:03.979814Z", - "shell.execute_reply": "2025-12-19T10:23:03.976617Z" - }, - "papermill": { - "duration": 0.640406, - "end_time": "2025-12-19T10:23:03.982829", - "exception": false, - "start_time": "2025-12-19T10:23:03.342423", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.987632Z", - "iopub.status.busy": "2025-12-19T10:23:03.985301Z", - "iopub.status.idle": "2025-12-19T10:23:04.011308Z", - "shell.execute_reply": "2025-12-19T10:23:04.008941Z" - }, - "papermill": { - "duration": 0.031002, - "end_time": "2025-12-19T10:23:04.014107", - "exception": false, - "start_time": "2025-12-19T10:23:03.983105", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which reporting rate PRODUCT_UID 
to use (DHIS2 dataset id)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", - "\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00015, - "end_time": "2025-12-19T10:23:04.014523", - "exception": false, - "start_time": "2025-12-19T10:23:04.014373", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.2. Validate parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.019283Z", - "iopub.status.busy": "2025-12-19T10:23:04.017257Z", - "iopub.status.idle": "2025-12-19T10:23:04.039652Z", - "shell.execute_reply": "2025-12-19T10:23:04.037292Z" - }, - "papermill": { - "duration": 0.02788, - "end_time": "2025-12-19T10:23:04.042642", - "exception": false, - "start_time": "2025-12-19T10:23:04.014762", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "id": "30bf8dfc", + "metadata": {}, + "source": [ + "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", + "\n", + "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", + "
\n", + "\n", + "**Dataset Selection**
\n", + "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", + "\n", + "**Calculation Logic**
\n", + "From the selected dataset(s):\n", + "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", + "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", + "\n", + "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", + "
\n", + "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", + "
\n", + "and expressed as a **proportion** between 0 and 1.\n", + "
\n", + "\n", + "-----\n", + "\n", + "### Additional Data Processing Steps\n", + "\n", + "- **Handling Multiple Datasets:** \n", + " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", + "\n", + "- **Deduplication of Entries:** \n", + " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", + "
    \n", + "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that the presence of a report is not missed.
  • \n", + "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", + "
\n", + "\n", + "-----\n", + "\n", + "\n", + "### 🇳🇪 Niger-Specific Processing: \n", + " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", + "
\n", + " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", + "\n", + "------\n", + "\n", + "### Pipeline parameters\n", + "\n", + "- **Routine data source**: Select the routine dataset variant used for reporting rate computation.\n", + "\n", + "- **`raw`**: Loads routine data from the formatted dataset.\n", + "\n", + "- **`imputed`**: Loads routine data from the outliers dataset using imputed values.\n", + "\n", + "- **`outliers_removed`**: Loads routine data from the outliers dataset after outliers removal." + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# default: raw routine\n", - "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }" - ] - }, - { - "cell_type": "markdown", - "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", - "metadata": { - "papermill": { - "duration": 0.000144, - "end_time": "2025-12-19T10:23:04.043066", - "exception": false, - "start_time": "2025-12-19T10:23:04.042922", - "status": "completed" + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 0.000092, + "end_time": "2025-12-19T10:21:50.273573", + "exception": false, + "start_time": "2025-12-19T10:21:50.273481", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. Setup" + ] }, - "tags": [] - }, - "source": [ - "#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" - ] - }, - { - "cell_type": "markdown", - "id": "682a62d5", - "metadata": {}, - "source": [ - "### 🐍 This probably to be moved to pipeline.py code?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7469898d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.047782Z", - "iopub.status.busy": "2025-12-19T10:23:04.045631Z", - "iopub.status.idle": "2025-12-19T10:23:04.545551Z", - "shell.execute_reply": "2025-12-19T10:23:04.542372Z" + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:21:50.332786Z", + "iopub.status.busy": "2025-12-19T10:21:50.277536Z", + "iopub.status.idle": "2025-12-19T10:23:03.339080Z", + "shell.execute_reply": "2025-12-19T10:23:03.336413Z" + }, + "papermill": { + "duration": 73.068006, + "end_time": "2025-12-19T10:23:03.341764", + "exception": false, + "start_time": "2025-12-19T10:21:50.273758", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "\n", + "# Load libraries \n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")" + ] }, - "papermill": { - "duration": 0.505908, - "end_time": "2025-12-19T10:23:04.549148", - "exception": false, - "start_time": "2025-12-19T10:23:04.043240", - "status": "completed" + { + "cell_type": "markdown", + "id": 
"7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 0.00017, + "end_time": "2025-12-19T10:23:03.342235", + "exception": false, + "start_time": "2025-12-19T10:23:03.342065", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.1. Load and check `config_json` file" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", - "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", - " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", - " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:04.549558", - "exception": false, - "start_time": "2025-12-19T10:23:04.549419", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.351367Z", + "iopub.status.busy": "2025-12-19T10:23:03.348819Z", + "iopub.status.idle": "2025-12-19T10:23:03.979814Z", + "shell.execute_reply": "2025-12-19T10:23:03.976617Z" + }, + "papermill": { + "duration": 0.640406, + "end_time": "2025-12-19T10:23:03.982829", + "exception": false, + "start_time": "2025-12-19T10:23:03.342423", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", 
+ "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 0.000152, - "end_time": "2025-12-19T10:23:04.549924", - "exception": false, - "start_time": "2025-12-19T10:23:04.549772", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.987632Z", + "iopub.status.busy": "2025-12-19T10:23:03.985301Z", + "iopub.status.idle": "2025-12-19T10:23:04.011308Z", + "shell.execute_reply": "2025-12-19T10:23:04.008941Z" + }, + "papermill": { + "duration": 0.031002, + "end_time": "2025-12-19T10:23:04.014107", + "exception": false, + "start_time": "2025-12-19T10:23:03.983105", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# Which reporting rate PRODUCT_UID to use (DHIS2 dataset id)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", + "\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] }, - "tags": [] - }, - "source": [ - "### 2.1. Load routine data (DHIS2) \n", - "Already formatted routine data, we use this as the master table
\n", - "(only used at the very end before exporting the table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.554212Z", - "iopub.status.busy": "2025-12-19T10:23:04.552423Z", - "iopub.status.idle": "2025-12-19T10:23:05.773324Z", - "shell.execute_reply": "2025-12-19T10:23:05.771316Z" + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00015, + "end_time": "2025-12-19T10:23:04.014523", + "exception": false, + "start_time": "2025-12-19T10:23:04.014373", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. Validate parameters" + ] }, - "papermill": { - "duration": 1.225668, - "end_time": "2025-12-19T10:23:05.775768", - "exception": false, - "start_time": "2025-12-19T10:23:04.550100", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.019283Z", + "iopub.status.busy": "2025-12-19T10:23:04.017257Z", + "iopub.status.idle": "2025-12-19T10:23:04.039652Z", + "shell.execute_reply": "2025-12-19T10:23:04.037292Z" + }, + "papermill": { + "duration": 0.02788, + "end_time": "2025-12-19T10:23:04.042642", + "exception": false, + "start_time": "2025-12-19T10:23:04.014762", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# default: raw routine\n", + "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# Subset data to keep only columns defined in fixed_cols_rr (if defined)\n", - "if (exists(\"fixed_cols_rr\")) {\n", - " dhis2_routine <- dhis2_routine %>% \n", - " select(any_of(fixed_cols_rr)) |> \n", - " distinct()\n", - "}\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", - "metadata": { - "papermill": { - "duration": 0.000155, - "end_time": "2025-12-19T10:23:05.776205", - "exception": false, - "start_time": "2025-12-19T10:23:05.776050", - "status": "completed" + { + "cell_type": "markdown", + "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", + "metadata": { + "papermill": { + "duration": 0.000144, + "end_time": "2025-12-19T10:23:04.043066", + "exception": false, + "start_time": "2025-12-19T10:23:04.042922", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" + ] }, - "tags": [] - }, - "source": [ - "### 2.2. 
Load Reporting Rate data (DHIS2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:05.780487Z", - "iopub.status.busy": "2025-12-19T10:23:05.778651Z", - "iopub.status.idle": "2025-12-19T10:23:07.096742Z", - "shell.execute_reply": "2025-12-19T10:23:07.094774Z" + { + "cell_type": "markdown", + "id": "682a62d5", + "metadata": {}, + "source": [ + "### 🐍 This probably to be moved to pipeline.py code?" + ] }, - "papermill": { - "duration": 1.322737, - "end_time": "2025-12-19T10:23:07.099136", - "exception": false, - "start_time": "2025-12-19T10:23:05.776399", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\") # reporting rate file\n", - "\n", - "# Load file from dataset\n", - "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", - "\n", - "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
\n", - "Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - "log_msg(msg)\n", - "head(dhis2_reporting, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "4d5f398b", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:07.099531", - "exception": false, - "start_time": "2025-12-19T10:23:07.099380", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Transform reporting data" - ] - }, - { - "cell_type": "markdown", - "id": "adcbee0b", - "metadata": { - "papermill": { - "duration": 0.0001, - "end_time": "2025-12-19T10:23:07.099849", - "exception": false, - "start_time": "2025-12-19T10:23:07.099749", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.1. Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", - "Logic:\n", - "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", - "* If none provided (**empty** field) skip filtering and **keep everything**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "795a5e74", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:07.104617Z", - "iopub.status.busy": "2025-12-19T10:23:07.102475Z", - "iopub.status.idle": "2025-12-19T10:23:08.406561Z", - "shell.execute_reply": "2025-12-19T10:23:08.404419Z" + { + "cell_type": "code", + "execution_count": null, + "id": "7469898d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.047782Z", + "iopub.status.busy": "2025-12-19T10:23:04.045631Z", + "iopub.status.idle": "2025-12-19T10:23:04.545551Z", + "shell.execute_reply": "2025-12-19T10:23:04.542372Z" + }, + "papermill": { + "duration": 0.505908, + "end_time": "2025-12-19T10:23:04.549148", + "exception": false, + "start_time": "2025-12-19T10:23:04.043240", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", + "if 
(is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", + " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", + " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", + "}" + ] }, - "papermill": { - "duration": 1.309322, - "end_time": "2025-12-19T10:23:08.409343", - "exception": false, - "start_time": "2025-12-19T10:23:07.100021", - "status": "completed" + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:04.549558", + "exception": false, + "start_time": "2025-12-19T10:23:04.549419", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. Load Data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", - "if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", - " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", - " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", - " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", - " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. \n", - " 🦘 Skipping filtering and keeping all data. 
Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "4237408a", - "metadata": { - "papermill": { - "duration": 0.000133, - "end_time": "2025-12-19T10:23:08.409660", - "exception": false, - "start_time": "2025-12-19T10:23:08.409527", - "status": "completed" + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 0.000152, + "end_time": "2025-12-19T10:23:04.549924", + "exception": false, + "start_time": "2025-12-19T10:23:04.549772", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Load routine data (DHIS2) \n", + "Already formatted routine data, we use this as the master table
\n", + "(only used at the very end before exporting the table)" + ] }, - "tags": [] - }, - "source": [ - "### 3.2. Pivot wider" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c3b9a65", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.413415Z", - "iopub.status.busy": "2025-12-19T10:23:08.411805Z", - "iopub.status.idle": "2025-12-19T10:23:08.884793Z", - "shell.execute_reply": "2025-12-19T10:23:08.880916Z" + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.554212Z", + "iopub.status.busy": "2025-12-19T10:23:04.552423Z", + "iopub.status.idle": "2025-12-19T10:23:05.773324Z", + "shell.execute_reply": "2025-12-19T10:23:05.771316Z" + }, + "papermill": { + "duration": 1.225668, + "end_time": "2025-12-19T10:23:05.775768", + "exception": false, + "start_time": "2025-12-19T10:23:04.550100", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select dataset\n", + "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", + " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "} else {\n", + " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + "}\n", + "\n", + "# Load file from dataset\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", + "\n", + "# Subset data to keep only columns defined in fixed_cols_rr (if 
defined)\n", + "if (exists(\"fixed_cols_rr\")) {\n", + " dhis2_routine <- dhis2_routine %>% \n", + " select(any_of(fixed_cols_rr)) |> \n", + " distinct()\n", + "}\n", + "\n", + "# log\n", + "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 3)" + ] }, - "papermill": { - "duration": 0.479538, - "end_time": "2025-12-19T10:23:08.889341", - "exception": false, - "start_time": "2025-12-19T10:23:08.409803", - "status": "completed" + { + "cell_type": "markdown", + "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", + "metadata": { + "papermill": { + "duration": 0.000155, + "end_time": "2025-12-19T10:23:05.776205", + "exception": false, + "start_time": "2025-12-19T10:23:05.776050", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.2. Load Reporting Rate data (DHIS2)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", - "dhis2_reporting_wide <- dhis2_reporting %>%\n", - " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", - "\n", - "# Log msg\n", - "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", - "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "0f485148", - "metadata": { - "papermill": { - "duration": 0.000186, - "end_time": "2025-12-19T10:23:08.889829", - "exception": false, - "start_time": "2025-12-19T10:23:08.889643", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": 
"0e352c76-f2fb-43ba-b85d-391d808057a8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:05.780487Z", + "iopub.status.busy": "2025-12-19T10:23:05.778651Z", + "iopub.status.idle": "2025-12-19T10:23:07.096742Z", + "shell.execute_reply": "2025-12-19T10:23:07.094774Z" + }, + "papermill": { + "duration": 1.322737, + "end_time": "2025-12-19T10:23:07.099136", + "exception": false, + "start_time": "2025-12-19T10:23:05.776399", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\") # reporting rate file\n", + "\n", + "# Load file from dataset\n", + "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", + "\n", + "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). \n", + "Dataframe dimensions: \", \n", + " paste(dim(dhis2_reporting), collapse=\", \"))\n", + "log_msg(msg)\n", + "head(dhis2_reporting, 3)" + ] }, - "tags": [] - }, - "source": [ - "### 👯 Handle **duplicated** values (`OU_ID`)\n", - "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." 
- ] - }, - { - "cell_type": "markdown", - "id": "55dececa", - "metadata": { - "papermill": { - "duration": 0.000122, - "end_time": "2025-12-19T10:23:08.890157", - "exception": false, - "start_time": "2025-12-19T10:23:08.890035", - "status": "completed" + { + "cell_type": "markdown", + "id": "4d5f398b", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:07.099531", + "exception": false, + "start_time": "2025-12-19T10:23:07.099380", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Transform reporting data" + ] }, - "tags": [] - }, - "source": [ - "#### Check for duplicated values (`OU_ID`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d761bd15", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.899486Z", - "iopub.status.busy": "2025-12-19T10:23:08.894706Z", - "iopub.status.idle": "2025-12-19T10:23:09.476248Z", - "shell.execute_reply": "2025-12-19T10:23:09.470283Z" + { + "cell_type": "markdown", + "id": "adcbee0b", + "metadata": { + "papermill": { + "duration": 0.0001, + "end_time": "2025-12-19T10:23:07.099849", + "exception": false, + "start_time": "2025-12-19T10:23:07.099749", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.1. 
Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", + "Logic:\n", + "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", + "* If none provided (**empty** field) skip filtering and **keep everything**" + ] }, - "papermill": { - "duration": 0.590832, - "end_time": "2025-12-19T10:23:09.481144", - "exception": false, - "start_time": "2025-12-19T10:23:08.890312", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "795a5e74", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:07.104617Z", + "iopub.status.busy": "2025-12-19T10:23:07.102475Z", + "iopub.status.idle": "2025-12-19T10:23:08.406561Z", + "shell.execute_reply": "2025-12-19T10:23:08.404419Z" + }, + "papermill": { + "duration": 1.309322, + "end_time": "2025-12-19T10:23:08.409343", + "exception": false, + "start_time": "2025-12-19T10:23:07.100021", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", + "if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", + " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", + " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", + " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", + " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", + "} else {\n", + " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. 
\n", + " 🦘 Skipping filtering and keeping all data. Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", + "}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if any OU_ID is present in more than one PRODUCT_UID\n", - "# and if so list them\n", - "ou_product_counts <- dhis2_reporting %>%\n", - " group_by(OU_ID, OU_NAME) %>%\n", - " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", - " filter(PRODUCT_UID_count > 1) %>%\n", - " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", - " distinct() \n", - "\n", - "ou_product_counts\n", - "\n", - "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", - "if (nrow(ou_product_counts) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", - "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", - " level = \"warning\")\n", - "} else {\n", - " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting data.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "805ed555", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:09.481549", - "exception": false, - "start_time": "2025-12-19T10:23:09.481410", - "status": "completed" + { + "cell_type": "markdown", + "id": "4237408a", + "metadata": { + "papermill": { + "duration": 0.000133, + "end_time": "2025-12-19T10:23:08.409660", + "exception": false, + "start_time": "2025-12-19T10:23:08.409527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.2. Pivot wider" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", - "Logic: \n", - "1. Identify if any `OU_ID` is present in both datasets\n", - "2. 
For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", - " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", - " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "593b013a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:09.488856Z", - "iopub.status.busy": "2025-12-19T10:23:09.484674Z", - "iopub.status.idle": "2025-12-19T10:23:13.563200Z", - "shell.execute_reply": "2025-12-19T10:23:13.559294Z" + { + "cell_type": "code", + "execution_count": null, + "id": "5c3b9a65", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.413415Z", + "iopub.status.busy": "2025-12-19T10:23:08.411805Z", + "iopub.status.idle": "2025-12-19T10:23:08.884793Z", + "shell.execute_reply": "2025-12-19T10:23:08.880916Z" + }, + "papermill": { + "duration": 0.479538, + "end_time": "2025-12-19T10:23:08.889341", + "exception": false, + "start_time": "2025-12-19T10:23:08.409803", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", + "dhis2_reporting_wide <- dhis2_reporting %>%\n", + " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", + "\n", + "# Log msg\n", + "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", + "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", + "\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)" + ] }, - "papermill": { - "duration": 4.086946, - "end_time": "2025-12-19T10:23:13.568699", - "exception": false, - "start_time": "2025-12-19T10:23:09.481753", - 
"status": "completed" + { + "cell_type": "markdown", + "id": "0f485148", + "metadata": { + "papermill": { + "duration": 0.000186, + "end_time": "2025-12-19T10:23:08.889829", + "exception": false, + "start_time": "2025-12-19T10:23:08.889643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 👯 Handle **duplicated** values (`OU_ID`)\n", + "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", - "dupl_ou_period <- dhis2_reporting_wide %>%\n", - " group_by(OU_ID, PERIOD) %>%\n", - " filter(n() > 1) %>%\n", - " ungroup() %>%\n", - " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", - "\n", - "# Log msg\n", - "if (nrow(dupl_ou_period) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", - "}\n", - "\n", - "dim(dupl_ou_period)\n", - "head(dupl_ou_period, 5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c72bd93a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:13.581200Z", - "iopub.status.busy": "2025-12-19T10:23:13.574942Z", - "iopub.status.idle": "2025-12-19T10:23:18.911910Z", - "shell.execute_reply": "2025-12-19T10:23:18.907746Z" + { + "cell_type": "markdown", + "id": "55dececa", + "metadata": { + "papermill": { + "duration": 0.000122, + "end_time": "2025-12-19T10:23:08.890157", + "exception": false, + "start_time": "2025-12-19T10:23:08.890035", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Check for duplicated values (`OU_ID`)" + ] }, - 
"papermill": { - "duration": 5.346749, - "end_time": "2025-12-19T10:23:18.915815", - "exception": false, - "start_time": "2025-12-19T10:23:13.569066", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "d761bd15", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.899486Z", + "iopub.status.busy": "2025-12-19T10:23:08.894706Z", + "iopub.status.idle": "2025-12-19T10:23:09.476248Z", + "shell.execute_reply": "2025-12-19T10:23:09.470283Z" + }, + "papermill": { + "duration": 0.590832, + "end_time": "2025-12-19T10:23:09.481144", + "exception": false, + "start_time": "2025-12-19T10:23:08.890312", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if any OU_ID is present in more than one PRODUCT_UID\n", + "# and if so list them\n", + "ou_product_counts <- dhis2_reporting %>%\n", + " group_by(OU_ID, OU_NAME) %>%\n", + " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", + " filter(PRODUCT_UID_count > 1) %>%\n", + " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", + " distinct() \n", + "\n", + "ou_product_counts\n", + "\n", + "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", + "if (nrow(ou_product_counts) > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", + "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", + " level = \"warning\")\n", + "} else {\n", + " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting data.\")\n", + "}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 2: remove duplicated OU_ID by PERIOD\n", - "# Use the following logic:\n", - "# - 1. 
first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", - "# - 2. then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", - "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", - "\n", - "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " group_by(PERIOD, OU_ID) %>%\n", - " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", - " ungroup() %>%\n", - " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", - " select(-ACTUAL_REPORTS_deduplicated)\n", - "\n", - " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", - " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(\"🚨 Warning: Cannot deduplicate OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. 
\n", - " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", - "} \n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "2f26c614", - "metadata": { - "papermill": { - "duration": 0.000236, - "end_time": "2025-12-19T10:23:18.916421", - "exception": false, - "start_time": "2025-12-19T10:23:18.916185", - "status": "completed" + { + "cell_type": "markdown", + "id": "805ed555", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:09.481549", + "exception": false, + "start_time": "2025-12-19T10:23:09.481410", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", + "Logic: \n", + "1. Identify if any `OU_ID` is present in both datasets\n", + "2. For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", + " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", + " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" + ] }, - "tags": [] - }, - "source": [ - "### 3.3. (🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", - "Specific for Niger SNIS instance!
\n", - "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", - "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4118991c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:18.924306Z", - "iopub.status.busy": "2025-12-19T10:23:18.920810Z", - "iopub.status.idle": "2025-12-19T10:23:19.482033Z", - "shell.execute_reply": "2025-12-19T10:23:19.479013Z" + { + "cell_type": "code", + "execution_count": null, + "id": "593b013a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:09.488856Z", + "iopub.status.busy": "2025-12-19T10:23:09.484674Z", + "iopub.status.idle": "2025-12-19T10:23:13.563200Z", + "shell.execute_reply": "2025-12-19T10:23:13.559294Z" + }, + "papermill": { + "duration": 4.086946, + "end_time": "2025-12-19T10:23:13.568699", + "exception": false, + "start_time": "2025-12-19T10:23:09.481753", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", + "dupl_ou_period <- dhis2_reporting_wide %>%\n", + " group_by(OU_ID, PERIOD) %>%\n", + " filter(n() > 1) %>%\n", + " ungroup() %>%\n", + " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", + "\n", + "# Log msg\n", + "if (nrow(dupl_ou_period) > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", + "}\n", + "\n", + "dim(dupl_ou_period)\n", + "head(dupl_ou_period, 5)" + ] }, - "papermill": { - "duration": 0.56938, - "end_time": "2025-12-19T10:23:19.486133", - "exception": false, - "start_time": "2025-12-19T10:23:18.916753", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "c72bd93a", + "metadata": { + "execution": { + 
"iopub.execute_input": "2025-12-19T10:23:13.581200Z", + "iopub.status.busy": "2025-12-19T10:23:13.574942Z", + "iopub.status.idle": "2025-12-19T10:23:18.911910Z", + "shell.execute_reply": "2025-12-19T10:23:18.907746Z" + }, + "papermill": { + "duration": 5.346749, + "end_time": "2025-12-19T10:23:18.915815", + "exception": false, + "start_time": "2025-12-19T10:23:13.569066", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 2: remove duplicated OU_ID by PERIOD\n", + "# Use the following logic:\n", + "# - 1. first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", + "# - 2. then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", + "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", + "\n", + "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " group_by(PERIOD, OU_ID) %>%\n", + " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", + " ungroup() %>%\n", + " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", + " select(-ACTUAL_REPORTS_deduplicated)\n", + "\n", + " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", + " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", + "} else {\n", + " log_msg(\"🚨 Warning: Cannot deduplicate OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. 
\n", + " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", + "} \n", + "\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", - "\n", - " # Check if any values >1 exist\n", - " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", - " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", - "\n", - " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", - " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", - " dupl_actual_reports <- dhis2_reporting_wide %>%\n", - " filter(ACTUAL_REPORTS > 1) %>%\n", - " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", - " distinct()\n", - "\n", - " log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", - "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", - "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", - "\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "\n", - " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", - "\n", - "} # else nothing to replace\n", - "\n", - " dim(dhis2_reporting_wide)\n", - " head(dhis2_reporting_wide, 3)\n", - "}" - ] - }, - { - "cell_type": 
"markdown", - "id": "066319a3", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2025-12-19T10:23:19.486674", - "exception": false, - "start_time": "2025-12-19T10:23:19.486502", - "status": "completed" + { + "cell_type": "markdown", + "id": "2f26c614", + "metadata": { + "papermill": { + "duration": 0.000236, + "end_time": "2025-12-19T10:23:18.916421", + "exception": false, + "start_time": "2025-12-19T10:23:18.916185", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.3. (🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", + "Specific for Niger SNIS instance!
\n", + "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", + "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." + ] }, - "tags": [] - }, - "source": [ - "### 3.4. Aggregate at AMD2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e94eeddd", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.494212Z", - "iopub.status.busy": "2025-12-19T10:23:19.491141Z", - "iopub.status.idle": "2025-12-19T10:23:19.791631Z", - "shell.execute_reply": "2025-12-19T10:23:19.786378Z" + { + "cell_type": "code", + "execution_count": null, + "id": "4118991c", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:18.924306Z", + "iopub.status.busy": "2025-12-19T10:23:18.920810Z", + "iopub.status.idle": "2025-12-19T10:23:19.482033Z", + "shell.execute_reply": "2025-12-19T10:23:19.479013Z" + }, + "papermill": { + "duration": 0.56938, + "end_time": "2025-12-19T10:23:19.486133", + "exception": false, + "start_time": "2025-12-19T10:23:18.916753", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", + "\n", + " # Check if any values >1 exist\n", + " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", + " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", + "\n", + " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", + " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", + " dupl_actual_reports <- dhis2_reporting_wide %>%\n", + " filter(ACTUAL_REPORTS > 1) %>%\n", + " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", + " distinct()\n", + "\n", + " 
log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", + "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", + "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", + "\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " mutate(\n", + " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", + " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", + " )\n", + "\n", + " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", + "\n", + "} # else nothing to replace\n", + "\n", + " dim(dhis2_reporting_wide)\n", + " head(dhis2_reporting_wide, 3)\n", + "}" + ] }, - "papermill": { - "duration": 0.308903, - "end_time": "2025-12-19T10:23:19.795888", - "exception": false, - "start_time": "2025-12-19T10:23:19.486985", - "status": "completed" + { + "cell_type": "markdown", + "id": "066319a3", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2025-12-19T10:23:19.486674", + "exception": false, + "start_time": "2025-12-19T10:23:19.486502", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.4. 
Aggregate at AMD2 level" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Sum up values (now at acility level) to get totals per ADM2_ID and PERIOD\n", - "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", - " group_by(\n", - " PERIOD, \n", - " YEAR, MONTH, # keep these just for sanity check (not needed for grouping)\n", - " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", - " ADM2_NAME, ADM2_ID\n", - " ) %>%\n", - " summarise(\n", - " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", - " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", - " .groups = 'drop'\n", - " ) \n", - "\n", - "# Add log messages\n", - "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. \n", - "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", - "head(dhis2_reporting_wide_adm2, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "eb181891", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:19.796350", - "exception": false, - "start_time": "2025-12-19T10:23:19.796199", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "e94eeddd", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.494212Z", + "iopub.status.busy": "2025-12-19T10:23:19.491141Z", + "iopub.status.idle": "2025-12-19T10:23:19.791631Z", + "shell.execute_reply": "2025-12-19T10:23:19.786378Z" + }, + "papermill": { + "duration": 0.308903, + "end_time": "2025-12-19T10:23:19.795888", + "exception": false, + "start_time": "2025-12-19T10:23:19.486985", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Sum up values (now at acility level) to get totals per ADM2_ID and PERIOD\n", + "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", + " group_by(\n", + " PERIOD, \n", + " YEAR, 
MONTH, # keep these just for sanity check (not needed for grouping)\n", + " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", + " ADM2_NAME, ADM2_ID\n", + " ) %>%\n", + " summarise(\n", + " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", + " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", + " .groups = 'drop'\n", + " ) \n", + "\n", + "# Add log messages\n", + "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. \n", + "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", + "head(dhis2_reporting_wide_adm2, 3)" + ] }, - "tags": [] - }, - "source": [ - "### 3.5. Calculate REPORTING_RATE\n", - "**numerator**: `ACTUAL_REPORTS`
\n", - "**denominator**: `EXPECTED_REPORTS`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a1c20", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.803233Z", - "iopub.status.busy": "2025-12-19T10:23:19.799996Z", - "iopub.status.idle": "2025-12-19T10:23:19.994060Z", - "shell.execute_reply": "2025-12-19T10:23:19.991575Z" + { + "cell_type": "markdown", + "id": "eb181891", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:19.796350", + "exception": false, + "start_time": "2025-12-19T10:23:19.796199", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.5. Calculate REPORTING_RATE\n", + "**numerator**: `ACTUAL_REPORTS`
\n", + "**denominator**: `EXPECTED_REPORTS`" + ] }, - "papermill": { - "duration": 0.200465, - "end_time": "2025-12-19T10:23:19.997024", - "exception": false, - "start_time": "2025-12-19T10:23:19.796559", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "e90a1c20", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.803233Z", + "iopub.status.busy": "2025-12-19T10:23:19.799996Z", + "iopub.status.idle": "2025-12-19T10:23:19.994060Z", + "shell.execute_reply": "2025-12-19T10:23:19.991575Z" + }, + "papermill": { + "duration": 0.200465, + "end_time": "2025-12-19T10:23:19.997024", + "exception": false, + "start_time": "2025-12-19T10:23:19.796559", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", + "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", + " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", + "\n", + "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", + "head(reporting_rate_results, 3) " + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - "\n", - "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. 
Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", - "head(reporting_rate_results, 3) " - ] - }, - { - "cell_type": "markdown", - "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", - "metadata": { - "papermill": { - "duration": 0.000123, - "end_time": "2025-12-19T10:23:19.997465", - "exception": false, - "start_time": "2025-12-19T10:23:19.997342", - "status": "completed" + { + "cell_type": "markdown", + "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", + "metadata": { + "papermill": { + "duration": 0.000123, + "end_time": "2025-12-19T10:23:19.997465", + "exception": false, + "start_time": "2025-12-19T10:23:19.997342", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", + "Left join reporting indicators with DHIS2 routine data.\n", + "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." + ] }, - "tags": [] - }, - "source": [ - "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", - "Left join reporting indicators with DHIS2 routine data.\n", - "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.001909Z", - "iopub.status.busy": "2025-12-19T10:23:19.999878Z", - "iopub.status.idle": "2025-12-19T10:23:20.072344Z", - "shell.execute_reply": "2025-12-19T10:23:20.070004Z" + { + "cell_type": "code", + "execution_count": null, + "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.001909Z", + "iopub.status.busy": "2025-12-19T10:23:19.999878Z", + "iopub.status.idle": "2025-12-19T10:23:20.072344Z", + "shell.execute_reply": "2025-12-19T10:23:20.070004Z" + }, + "papermill": { + "duration": 0.077426, + "end_time": "2025-12-19T10:23:20.075077", + "exception": false, + "start_time": "2025-12-19T10:23:19.997651", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "reporting_rate_dataset <- left_join(dhis2_routine, \n", + " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", + " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", + "\n", + "print(dim(reporting_rate_dataset))\n", + "head(reporting_rate_dataset, 3)" + ] }, - "papermill": { - "duration": 0.077426, - "end_time": "2025-12-19T10:23:20.075077", - "exception": false, - "start_time": "2025-12-19T10:23:19.997651", - "status": "completed" + { + "cell_type": "markdown", + "id": "6b19e88d", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2025-12-19T10:23:20.075561", + "exception": false, + "start_time": "2025-12-19T10:23:20.075388", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.7. 
Final visual check on REPORTING_RATE values" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "reporting_rate_dataset <- left_join(dhis2_routine, \n", - " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", - " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", - "\n", - "print(dim(reporting_rate_dataset))\n", - "head(reporting_rate_dataset, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "6b19e88d", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2025-12-19T10:23:20.075561", - "exception": false, - "start_time": "2025-12-19T10:23:20.075388", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "fbfec60f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", + "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "if (min_rr < 0 | max_rr > 1) { \n", + " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", + "} else {\n", + " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", + "}" + ] }, - "tags": [] - }, - "source": [ - "### 3.7. 
Final visual check on REPORTING_RATE values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbfec60f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", - "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "if (min_rr < 0 | max_rr > 1) { \n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", - "} else {\n", - " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8878192f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.080475Z", - "iopub.status.busy": "2025-12-19T10:23:20.078272Z", - "iopub.status.idle": "2025-12-19T10:23:21.456898Z", - "shell.execute_reply": "2025-12-19T10:23:21.453352Z" + { + "cell_type": "code", + "execution_count": null, + "id": "8878192f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.080475Z", + "iopub.status.busy": "2025-12-19T10:23:20.078272Z", + "iopub.status.idle": "2025-12-19T10:23:21.456898Z", + "shell.execute_reply": "2025-12-19T10:23:21.453352Z" + }, + "papermill": { + "duration": 1.384875, + "end_time": "2025-12-19T10:23:21.460674", + "exception": false, + "start_time": "2025-12-19T10:23:20.075799", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Simple plot to visualize distribution of REPORTING_RATE\n", + "ggplot(reporting_rate_dataset, 
aes(x=REPORTING_RATE)) +\n", + " geom_histogram() +\n", + " labs(\n", + " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", + " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", + " ) +\n", + " theme_minimal()" + ] }, - "papermill": { - "duration": 1.384875, - "end_time": "2025-12-19T10:23:21.460674", - "exception": false, - "start_time": "2025-12-19T10:23:20.075799", - "status": "completed" + { + "cell_type": "markdown", + "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", + "metadata": { + "papermill": { + "duration": 0.000104, + "end_time": "2025-12-19T10:23:21.460981", + "exception": false, + "start_time": "2025-12-19T10:23:21.460877", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 4. 📁 Export to `data/` folder\n", + "Export as both .csv and .parquet file formats." + ] }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "execution_count": null, + "id": "9adc033d-18d6-4786-8f96-21337b3e005f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:21.467337Z", + "iopub.status.busy": "2025-12-19T10:23:21.464010Z", + "iopub.status.idle": "2025-12-19T10:23:22.383295Z", + "shell.execute_reply": "2025-12-19T10:23:22.379935Z" + }, + "papermill": { + "duration": 0.926094, + "end_time": "2025-12-19T10:23:22.387190", + "exception": false, + "start_time": "2025-12-19T10:23:21.461096", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", + "\n", + "# parquet\n", + "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")) \n", + "write_parquet(reporting_rate_dataset, file_path)\n", + "log_msg(glue(\"Exported : {file_path}\"))\n", + "\n", + "# csv\n", + "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, 
\"_reporting_rate_dataset.csv\"))\n", + "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", + "log_msg(glue(\"Exported : {file_path}\"))" + ] } - }, - "outputs": [], - "source": [ - "# Simple plot to visualize distribution of REPORTING_RATE\n", - "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", - " geom_histogram() +\n", - " labs(\n", - " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", - " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", - "metadata": { - "papermill": { - "duration": 0.000104, - "end_time": "2025-12-19T10:23:21.460981", - "exception": false, - "start_time": "2025-12-19T10:23:21.460877", - "status": "completed" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "## 4. 📁 Export to `data/` folder\n", - "Export as both .csv and .parquet file formats." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9adc033d-18d6-4786-8f96-21337b3e005f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:21.467337Z", - "iopub.status.busy": "2025-12-19T10:23:21.464010Z", - "iopub.status.idle": "2025-12-19T10:23:22.383295Z", - "shell.execute_reply": "2025-12-19T10:23:22.379935Z" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, "papermill": { - "duration": 0.926094, - "end_time": "2025-12-19T10:23:22.387190", - "exception": false, - "start_time": "2025-12-19T10:23:21.461096", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "default_parameters": {}, + "duration": 94.192072, + "end_time": "2025-12-19T10:23:22.614345", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", + "parameters": { + "ROUTINE_FILE": "NER_routine_outliers-mean_imputed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace" + }, + "start_time": "2025-12-19T10:21:48.422273", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")) \n", - "write_parquet(reporting_rate_dataset, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", - "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ] - } - ], - 
"metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 94.192072, - "end_time": "2025-12-19T10:23:22.614345", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", - "parameters": { - "ROUTINE_FILE": "NER_routine_outliers-mean_imputed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace" - }, - "start_time": "2025-12-19T10:21:48.422273", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/snt_dhis2_incidence/pipeline.py b/snt_dhis2_incidence/pipeline.py index d4d6f38..02a4063 100644 --- a/snt_dhis2_incidence/pipeline.py +++ b/snt_dhis2_incidence/pipeline.py @@ -121,12 +121,9 @@ def snt_dhis2_incidence( "Pregnant women": "PREGNANT_WOMAN", } - notebook_routine_data_choice = ( - "raw_without_outliers" if routine_data_choice == "outliers_removed" else routine_data_choice - ) notebook_params = { "N1_METHOD": n1_method, - "ROUTINE_DATA_CHOICE": notebook_routine_data_choice, + "ROUTINE_DATA_CHOICE": routine_data_choice, "USE_CSB_DATA": use_csb_data, "USE_ADJUSTED_POPULATION": use_adjusted_population, "DISAGGREGATION_SELECTION": ( diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index f6052fc..b52c32c 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -72,7 +72,7 @@ def snt_dhis2_reporting_rate_dataset( country_code = 
snt_config["SNT_CONFIG"]["COUNTRY_CODE"] if not run_report_only: - routine_file_candidates = resolve_routine_file_candidates( + routine_file = resolve_routine_filename( country_code=country_code, routine_data_choice=routine_data_choice ) if routine_data_choice == "raw": @@ -80,18 +80,9 @@ def snt_dhis2_reporting_rate_dataset( else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] - routine_file = next( - ( - filename - for filename in routine_file_candidates - if dataset_file_exists(ds_id=ds_outliers_id, filename=filename) - ), - None, - ) - if routine_file is None: + if not dataset_file_exists(ds_id=ds_outliers_id, filename=routine_file): current_run.log_warning( - f"None of the expected routine files were found in dataset {ds_outliers_id}: " - f"{routine_file_candidates}. " + f"Routine file {routine_file} was not found in dataset {ds_outliers_id}. " "Perhaps the outliers-imputation pipeline has not been run yet. Processing cannot continue." ) return @@ -143,32 +134,16 @@ def snt_dhis2_reporting_rate_dataset( raise -def resolve_routine_file_candidates(country_code: str, routine_data_choice: str) -> list[str]: - """Returns ordered candidate filenames for a routine data choice.""" +def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: + """Returns the canonical routine filename for a routine data choice.""" if routine_data_choice == "raw": - return [f"{country_code}_routine.parquet"] + return f"{country_code}_routine.parquet" if routine_data_choice == "imputed": - return [ - f"{country_code}_routine_outliers_imputed.parquet", - f"{country_code}_routine_outliers-mean_imputed.parquet", - f"{country_code}_routine_outliers-median_imputed.parquet", - f"{country_code}_routine_outliers-iqr_imputed.parquet", - f"{country_code}_routine_outliers-trend_imputed.parquet", - f"{country_code}_routine_outliers-mg-partial_imputed.parquet", - f"{country_code}_routine_outliers-mg-complete_imputed.parquet", - ] + return 
f"{country_code}_routine_outliers_imputed.parquet" if routine_data_choice == "outliers_removed": - return [ - f"{country_code}_routine_outliers_removed.parquet", - f"{country_code}_routine_outliers-mean_removed.parquet", - f"{country_code}_routine_outliers-median_removed.parquet", - f"{country_code}_routine_outliers-iqr_removed.parquet", - f"{country_code}_routine_outliers-trend_removed.parquet", - f"{country_code}_routine_outliers-mg-partial_removed.parquet", - f"{country_code}_routine_outliers-mg-complete_removed.parquet", - ] + return f"{country_code}_routine_outliers_removed.parquet" raise ValueError(f"Unknown routine data choice: {routine_data_choice}") From 0390973cf741e9ab8cd55b45b24bb27c7bb8e881 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 20 Mar 2026 13:38:51 +0100 Subject: [PATCH 3/3] update for outliers imputation --- .../code/snt_dhis2_incidence.ipynb | 2 +- .../snt_dhis2_incidence_NER.ipynb | 2 +- .../snt_dhis2_incidence_report.ipynb | 3062 ++++++++--------- ...snt_dhis2_reporting_rate_dataelement.ipynb | 2380 +++++++------ .../snt_dhis2_reporting_rate_dataset.ipynb | 2316 ++++++------- ..._dhis2_reporting_rate_dataset_report.ipynb | 2580 +++++++------- .../pipeline.py | 90 +- snt_dhis2_reporting_rate_dataset/pipeline.py | 1 + 8 files changed, 5197 insertions(+), 5236 deletions(-) diff --git a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb index 9b0ff8e..93dd271 100644 --- a/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb +++ b/pipelines/snt_dhis2_incidence/code/snt_dhis2_incidence.ipynb @@ -303,7 +303,7 @@ " # Check if the error message indicates that the file does not exist \n", " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", - " routine_dataset_name, \"`. 
To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", + " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation` so canonical outputs are available (`*_routine_outliers_imputed.parquet` or `*_routine_outliers_removed.parquet`).\")\n", " } else {\n", " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". [ERROR DETAILS] \" , conditionMessage(e))\n", " } \n", diff --git a/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb b/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb index 9b0ff8e..93dd271 100644 --- a/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb +++ b/pipelines/snt_dhis2_incidence/country_specific/snt_dhis2_incidence_NER.ipynb @@ -303,7 +303,7 @@ " # Check if the error message indicates that the file does not exist \n", " if (grepl(\"does not exist\", conditionMessage(e), ignore.case = TRUE)) { \n", " msg <- paste0(\"[ERROR] File not found! 🛑 The file `\", routine_filename, \"` does not exist in `\", \n", - " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation`, choosing the appropriate method.\")\n", + " routine_dataset_name, \"`. To generate it, execute the pipeline `DHIS2 Outliers Removal and Imputation` so canonical outputs are available (`*_routine_outliers_imputed.parquet` or `*_routine_outliers_removed.parquet`).\")\n", " } else {\n", " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file for: \", COUNTRY_CODE, \". 
[ERROR DETAILS] \" , conditionMessage(e))\n", " } \n", diff --git a/pipelines/snt_dhis2_incidence/reporting/snt_dhis2_incidence_report.ipynb b/pipelines/snt_dhis2_incidence/reporting/snt_dhis2_incidence_report.ipynb index 3022073..929f272 100644 --- a/pipelines/snt_dhis2_incidence/reporting/snt_dhis2_incidence_report.ipynb +++ b/pipelines/snt_dhis2_incidence/reporting/snt_dhis2_incidence_report.ipynb @@ -1,1537 +1,1535 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "9f017694-ec13-4584-80cb-c1c529863ec1", - "metadata": {}, - "source": [ - "# Estimations de l’incidence brute et ajustée" - ] - }, - { - "cell_type": "markdown", - "id": "a723a172", - "metadata": {}, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5b30ae1-805f-444f-a34c-049b590e04b7", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "# DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2')\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", - "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", - "FIGURES_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_incidence/reporting/outputs/figures\")\n", - "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\n", - " \"dplyr\", \n", - " \"tidyr\", \n", - " \"ggplot2\", \n", - " \"stringr\", \n", - " \"arrow\", \n", - " \"sf\", \n", - " \"reticulate\" \n", - " )\n", - "\n", - "# Execute function\n", - 
"install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d10d93d8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Required environment for the sf packages\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "markdown", - "id": "1134f15e", - "metadata": {}, - "source": [ - "#### Load `SNT_config`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb52fb2f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ef5b67d-f94c-40db-93ff-c5fdbdee134f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration variables\n", - "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_INCIDENCE\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "\n", - "# Cols to select from pyramid\n", - "ADMIN_1_NAME <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2_NAME <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "ADMIN_1_ID <- str_replace(ADMIN_1_NAME, \"_NAME\", \"_ID\")\n", - "ADMIN_2_ID <- str_replace(ADMIN_2_NAME, \"_NAME\", \"_ID\")" - ] - }, - { - "cell_type": "markdown", - "id": "18d18029", - "metadata": {}, - "source": [ - "#### Load `SNT_metadata`\n", - 
"This is needed for the correct use of palettes and categories (breaks, or scale)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e406d8e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT metadata\n", - "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04c6cab4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Handle situation in which metadata is not correctly loaded (for example if the specific node is missing in the json file)\n", - "if (is.null(metadata_json$INCIDENCE_CRUDE$SCALE)) {\n", - " log_msg(\"Warning: Incidence (crude) scale break values cannot be loaded from SNT_metadata.json because the node $INCIDENCE_CRUDE$SCALE is missing.\", \"warning\")\n", - "} else {\n", - " break_vals <- jsonlite::fromJSON(metadata_json$INCIDENCE_CRUDE$SCALE)\n", - " log_msg(paste0(\"Incidence (crude) scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "37ef2734", - "metadata": {}, - "source": [ - "## 2. Load data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3916fbc6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "DATASET_INCIDENCE <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_INCIDENCE" - ] - }, - { - "cell_type": "markdown", - "id": "0ddca5f6", - "metadata": {}, - "source": [ - "#### 2.0. 
Parameters: `parameters_json`\n", - "This is how we keep track of parameters choices. File stored in OH Dataset (same as main output).\n", - "\n", - "**Replaces the papermill-injected parameters which used to be in the first cell!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f7d1a18", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# This is output of the main code notebook! => DATASET_INCIDENCE\n", - "parameters <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_INCIDENCE, paste0(COUNTRY_CODE, \"_parameters.json\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "glimpse(parameters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b1bb741", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "N1_METHOD <- parameters$N1_METHOD\n", - "ROUTINE_DATA_CHOICE <- parameters$ROUTINE_DATA_CHOICE\n", - "OUTLIER_DETECTION_METHOD <- parameters$OUTLIER_DETECTION_METHOD\n", - "USE_CSB_DATA <- parameters$USE_CSB_DATA\n", - "USE_ADJUSTED_POPULATION <- parameters$USE_ADJUSTED_POPULATION\n", - "DISAGGREGATION_SELECTION <- parameters$DISAGGREGATION_SELECTION" - ] - }, - { - "cell_type": "markdown", - "id": "76f0c98a", - "metadata": {}, - "source": [ - "#### 2.1. 
Shapes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64215532", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# import DHIS2 shapes data\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })" - ] - }, - { - "cell_type": "markdown", - "id": "59b5b68c", - "metadata": {}, - "source": [ - "#### 2.2. Pyramid\n", - "This is needed to add back the `*_NAME` cols to the main data
\n", - "(Because normally we only output tables with the `*_ID` cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0c17f43-8825-44cf-a4a1-7ce0c25cee17", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72af5723", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Keep only relevant cols and rename them to match incidence data\n", - "pyramid <- pyramid_data %>%\n", - " select(\n", - " ADM1_ID = all_of(ADMIN_1_ID),\n", - " ADM1_NAME = all_of(ADMIN_1_NAME), \n", - " ADM2_ID = all_of(ADMIN_2_ID),\n", - " ADM2_NAME = all_of(ADMIN_2_NAME)\n", - " ) %>%\n", - " distinct()\n", - "\n", - "head(pyramid, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "866bca3b", - "metadata": {}, - "source": [ - "#### 2.3. Monthly cases\n", - "Needed for coherence checks:\n", - "* **TPR** at monthly level over time \n", - " * Explain changes (or lack thereof) between Crude and Adj1\n", - " * Useful to monitor resistance (or testing behaviour ... ?)\n", - "* **Reporting Rate**\n", - " * Explain changes (or lack thereof) between Adj1 and Adj2\n", - "* **Indicators** coherence:\n", - " * SUSP > TEST\n", - " * TEST > CONF\n", - " * ... (check and add more ...)\n", - "\n", - "\n", - "⚠️ Note: **Import** from 📁`/data/` folder (not OH Dataset)
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37b613d1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Import monthly_cases data from \n", - "\n", - "# file_path <- file.path(DATA_PATH, \"incidence\", paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", - "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", - "monthly_cases <- arrow::read_parquet(file_path)\n", - "log_msg(paste0(\"Monthly cases data loaded from : \", file_path))\n", - "\n", - "dim(monthly_cases)\n", - "head(monthly_cases, 3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdd78b96", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add _NAME cols by joining with pyramid_data\n", - "monthly_cases <- left_join(monthly_cases, pyramid, by = join_by(ADM1_ID, ADM2_ID))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93c7ac24", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(monthly_cases, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "605af751", - "metadata": {}, - "source": [ - "#### 2.4. Yearly Incidence\n", - "Currently, **each execution of the Incidence pipeline adds a new file to the OH Dataset**, where the filename stores the choice of parameters used.
\n", - "This introduces the issue of having to chose the correct file to import.\n", - "\n", - "For this, we need to **resolve the correct filename**, based on:\n", - "1. Pipeline paramters (injected here as well): `COUNTRY_CODE`, `ROUTINE_DATA_CHOICE`\n", - "2. Context-derived parameter (based on filename of what available in Dataset): `REPORTING_RATE_METHOD`" - ] - }, - { - "cell_type": "markdown", - "id": "df00062c", - "metadata": {}, - "source": [ - "**Note**: `REPORTING_RATE_METHOD` this is NOT a parameter!
\n", - "The method is derived based on what is available in the dataset `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cce0c1bf", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Define dataset and file names (based on parameter)\n", - "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - "\n", - "# Determine REPORTING_RATE_METHOD based on available file names in the dataset (without loading files)\n", - "dataset_last_version <- openhexa$workspace$get_dataset(rr_dataset_name)$latest_version\n", - "files_iter <- dataset_last_version$files\n", - "\n", - "files <- list()\n", - "repeat {\n", - " file <- tryCatch(\n", - " py_to_r(iter_next(files_iter)),\n", - " error = function(e) NULL\n", - " )\n", - " if (is.null(file)) break\n", - " files <- append(files, list(file))\n", - "}\n", - "\n", - "filenames <- sapply(files, function(f) f$filename)\n", - "\n", - "if (file_name_de %in% filenames) {\n", - " REPORTING_RATE_METHOD <- \"dataelement\"\n", - "} else if (file_name_ds %in% filenames) {\n", - " REPORTING_RATE_METHOD <- \"dataset\"\n", - "} else {\n", - " stop(glue(\"[ERROR] Neither reporting rate file found for: {COUNTRY_CODE}\"))\n", - "}\n", - "\n", - "log_msg(paste0(\"Determined REPORTING_RATE_METHOD: \", REPORTING_RATE_METHOD))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19f87fa2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "filename_to_import <- glue::glue(\n", - " \"{COUNTRY_CODE}_incidence.parquet\"\n", - " )\n", - "\n", - "log_msg(paste0(\"Importing yearly incidence data from file : \", filename_to_import))" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "id": "cad08af7-9233-4138-91b8-5b82ed546cec", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "yearly_incidence <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, filename_to_import) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "dim(yearly_incidence)\n", - "head(yearly_incidence, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "e2098887", - "metadata": {}, - "source": [ - "## Plot settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a45ada9", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Define suffix for exporting final outputs (preserve selected disaggregation in filename)\n", - "DISAGGREGATION_SELECTION_SUFFIX <- ifelse(is.null(DISAGGREGATION_SELECTION), \"TOTAL\", DISAGGREGATION_SELECTION)" - ] - }, - { - "cell_type": "markdown", - "id": "d9296a78", - "metadata": {}, - "source": [ - "### 🎨 Dynamic categories and color assignement" - ] - }, - { - "cell_type": "markdown", - "id": "641a42ec", - "metadata": {}, - "source": [ - "##### 1. Define breaks and labels" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5851885d", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "9f017694-ec13-4584-80cb-c1c529863ec1", + "metadata": {}, + "source": [ + "# Estimations de l’incidence brute et ajustée" + ] + }, + { + "cell_type": "markdown", + "id": "a723a172", + "metadata": {}, + "source": [ + "## 1. 
Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5b30ae1-805f-444f-a34c-049b590e04b7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "# DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2')\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2', 'incidence') # store the output of the pipeline (only final results)\n", + "INTERMEDIATE_DATA_PATH <- file.path(DATA_PATH, \"intermediate_results\") # intermediate results for reporting nb or else, NOT for OH Dataset!\n", + "FIGURES_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_incidence/reporting/outputs/figures\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Load palettes\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "\n", + "# List required packages \n", + "required_packages <- c(\n", + " \"dplyr\", \n", + " \"tidyr\", \n", + " \"ggplot2\", \n", + " \"stringr\", \n", + " \"arrow\", \n", + " \"sf\", \n", + " \"reticulate\" \n", + " )\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d10d93d8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Required environment for the sf packages\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "markdown", + "id": "1134f15e", + "metadata": {}, + "source": [ + "#### Load `SNT_config`" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb52fb2f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ef5b67d-f94c-40db-93ff-c5fdbdee134f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_INCIDENCE\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "\n", + "# Cols to select from pyramid\n", + "ADMIN_1_NAME <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2_NAME <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "ADMIN_1_ID <- str_replace(ADMIN_1_NAME, \"_NAME\", \"_ID\")\n", + "ADMIN_2_ID <- str_replace(ADMIN_2_NAME, \"_NAME\", \"_ID\")" + ] + }, + { + "cell_type": "markdown", + "id": "18d18029", + "metadata": {}, + "source": [ + "#### Load `SNT_metadata`\n", + "This is needed for the correct use of palettes and categories (breaks, or scale)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e406d8e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT metadata\n", + "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "04c6cab4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Handle situation in which metadata is not correctly loaded (for example if the specific node is missing in the json file)\n", + "if (is.null(metadata_json$INCIDENCE_CRUDE$SCALE)) {\n", + " log_msg(\"Warning: Incidence (crude) scale break values cannot be loaded from SNT_metadata.json because the node $INCIDENCE_CRUDE$SCALE is missing.\", \"warning\")\n", + "} else {\n", + " break_vals <- jsonlite::fromJSON(metadata_json$INCIDENCE_CRUDE$SCALE)\n", + " log_msg(paste0(\"Incidence (crude) scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "37ef2734", + "metadata": {}, + "source": [ + "## 2. Load data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3916fbc6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "DATASET_INCIDENCE <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_INCIDENCE" + ] + }, + { + "cell_type": "markdown", + "id": "0ddca5f6", + "metadata": {}, + "source": [ + "#### 2.0. Parameters: `parameters_json`\n", + "This is how we keep track of parameters choices. File stored in OH Dataset (same as main output).\n", + "\n", + "**Replaces the papermill-injected parameters which used to be in the first cell!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f7d1a18", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# This is output of the main code notebook! 
=> DATASET_INCIDENCE\n", + "parameters <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_INCIDENCE, paste0(COUNTRY_CODE, \"_parameters.json\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "glimpse(parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b1bb741", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "N1_METHOD <- parameters$N1_METHOD\n", + "ROUTINE_DATA_CHOICE <- parameters$ROUTINE_DATA_CHOICE\n", + "USE_CSB_DATA <- parameters$USE_CSB_DATA\n", + "USE_ADJUSTED_POPULATION <- parameters$USE_ADJUSTED_POPULATION\n", + "DISAGGREGATION_SELECTION <- parameters$DISAGGREGATION_SELECTION" + ] + }, + { + "cell_type": "markdown", + "id": "76f0c98a", + "metadata": {}, + "source": [ + "#### 2.1. Shapes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64215532", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import DHIS2 shapes data\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })" + ] + }, + { + "cell_type": "markdown", + "id": "59b5b68c", + "metadata": {}, + "source": [ + "#### 2.2. Pyramid\n", + "This is needed to add back the `*_NAME` cols to the main data
\n", + "(Because normally we only output tables with the `*_ID` cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0c17f43-8825-44cf-a4a1-7ce0c25cee17", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72af5723", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Keep only relevant cols and rename them to match incidence data\n", + "pyramid <- pyramid_data %>%\n", + " select(\n", + " ADM1_ID = all_of(ADMIN_1_ID),\n", + " ADM1_NAME = all_of(ADMIN_1_NAME), \n", + " ADM2_ID = all_of(ADMIN_2_ID),\n", + " ADM2_NAME = all_of(ADMIN_2_NAME)\n", + " ) %>%\n", + " distinct()\n", + "\n", + "head(pyramid, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "866bca3b", + "metadata": {}, + "source": [ + "#### 2.3. Monthly cases\n", + "Needed for coherence checks:\n", + "* **TPR** at monthly level over time \n", + " * Explain changes (or lack thereof) between Crude and Adj1\n", + " * Useful to monitor resistance (or testing behaviour ... ?)\n", + "* **Reporting Rate**\n", + " * Explain changes (or lack thereof) between Adj1 and Adj2\n", + "* **Indicators** coherence:\n", + " * SUSP > TEST\n", + " * TEST > CONF\n", + " * ... (check and add more ...)\n", + "\n", + "\n", + "⚠️ Note: **Import** from 📁`/data/` folder (not OH Dataset)
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37b613d1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import monthly_cases data from \n", + "\n", + "# file_path <- file.path(DATA_PATH, \"incidence\", paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", + "file_path <- file.path(INTERMEDIATE_DATA_PATH, paste0(COUNTRY_CODE, \"_monthly_cases.parquet\"))\n", + "monthly_cases <- arrow::read_parquet(file_path)\n", + "log_msg(paste0(\"Monthly cases data loaded from : \", file_path))\n", + "\n", + "dim(monthly_cases)\n", + "head(monthly_cases, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdd78b96", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add _NAME cols by joining with pyramid_data\n", + "monthly_cases <- left_join(monthly_cases, pyramid, by = join_by(ADM1_ID, ADM2_ID))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93c7ac24", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(monthly_cases, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "605af751", + "metadata": {}, + "source": [ + "#### 2.4. Yearly Incidence\n", + "Currently, **each execution of the Incidence pipeline adds a new file to the OH Dataset**, where the filename stores the choice of parameters used.
\n", + "This introduces the issue of having to choose the correct file to import.\n", + "\n", + "For this, we need to **resolve the correct filename**, based on:\n", + "1. Pipeline parameters (injected here as well): `COUNTRY_CODE`, `ROUTINE_DATA_CHOICE`\n", + "2. Context-derived parameter (based on the filenames available in the Dataset): `REPORTING_RATE_METHOD`" + ] + }, + { + "cell_type": "markdown", + "id": "df00062c", + "metadata": {}, + "source": [ + "**Note**: `REPORTING_RATE_METHOD` is NOT a parameter!
\n", + "The method is derived based on what is available in the dataset `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cce0c1bf", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define dataset and file names (based on parameter)\n", + "rr_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "file_name_de <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", + "file_name_ds <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", + "\n", + "# Determine REPORTING_RATE_METHOD based on available file names in the dataset (without loading files)\n", + "dataset_last_version <- openhexa$workspace$get_dataset(rr_dataset_name)$latest_version\n", + "files_iter <- dataset_last_version$files\n", + "\n", + "files <- list()\n", + "repeat {\n", + " file <- tryCatch(\n", + " py_to_r(iter_next(files_iter)),\n", + " error = function(e) NULL\n", + " )\n", + " if (is.null(file)) break\n", + " files <- append(files, list(file))\n", + "}\n", + "\n", + "filenames <- sapply(files, function(f) f$filename)\n", + "\n", + "if (file_name_de %in% filenames) {\n", + " REPORTING_RATE_METHOD <- \"dataelement\"\n", + "} else if (file_name_ds %in% filenames) {\n", + " REPORTING_RATE_METHOD <- \"dataset\"\n", + "} else {\n", + " stop(glue(\"[ERROR] Neither reporting rate file found for: {COUNTRY_CODE}\"))\n", + "}\n", + "\n", + "log_msg(paste0(\"Determined REPORTING_RATE_METHOD: \", REPORTING_RATE_METHOD))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19f87fa2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "filename_to_import <- glue::glue(\n", + " \"{COUNTRY_CODE}_incidence.parquet\"\n", + " )\n", + "\n", + "log_msg(paste0(\"Importing yearly incidence data from file : \", filename_to_import))" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "cad08af7-9233-4138-91b8-5b82ed546cec", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "yearly_incidence <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_INCIDENCE, filename_to_import) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading yearly incidence file for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "dim(yearly_incidence)\n", + "head(yearly_incidence, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "e2098887", + "metadata": {}, + "source": [ + "## Plot settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a45ada9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define suffix for exporting final outputs (preserve selected disaggregation in filename)\n", + "DISAGGREGATION_SELECTION_SUFFIX <- ifelse(is.null(DISAGGREGATION_SELECTION), \"TOTAL\", DISAGGREGATION_SELECTION)" + ] + }, + { + "cell_type": "markdown", + "id": "d9296a78", + "metadata": {}, + "source": [ + "### 🎨 Dynamic categories and color assignment" + ] + }, + { + "cell_type": "markdown", + "id": "641a42ec", + "metadata": {}, + "source": [ + "##### 1. Define breaks and labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5851885d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Safety code to avoid breaking if nothing is found in metadata_json\n", + "if (!exists(\"break_vals\") || is.null(break_vals) || length(break_vals) == 0) {\n", + " log_msg(\"[WARNING] No break values found in SNT_metadata.json for INCIDENCE_CRUDE$SCALE.
Using default values.\", \"warning\")\n", + " break_vals <- c(100, 250, 450, 1000)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03806bba", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create the full set of cut points (0 to Infinity)\n", + "full_breaks <- c(0, break_vals, Inf)\n", + "\n", + "# Create dynamic labels\n", + "labels <- c(\n", + " paste0(\"< \", break_vals[1]), # First label\n", + " paste0(break_vals[-length(break_vals)], \"-\", break_vals[-1]), # Middle labels\n", + " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", + ")\n", + "\n", + "# Check\n", + "labels" + ] + }, + { + "cell_type": "markdown", + "id": "a8f168e2", + "metadata": {}, + "source": [ + "##### 3. Pick appropriate palette" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c2d0136", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Count nr of breaks\n", + "nr_of_colors <- length(labels)\n", + "\n", + "# nr_of_colors\n", + "palette_to_use <- get_range_from_count(nr_of_colors)\n", + "\n", + "# # Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", + "# # Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correcpond to lowest value)\n", + "# names(palette_to_use) <- rev(labels)\n", + "\n", + "print(palette_to_use)\n" + ] + }, + { + "cell_type": "markdown", + "id": "d765607f", + "metadata": {}, + "source": [ + "#### Define plot size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f9c3af7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "options(repr.plot.width = 20, repr.plot.height = 12)" + ] + }, + { + "cell_type": "markdown", + "id": "1768642c-297e-4a9f-939d-4460c047761b", + "metadata": {}, + "source": [ + "## Coherence checks\n", + "\n", + "See 
Jira: https://bluesquare.atlassian.net/browse/SNT25-272" + ] + }, + { + "cell_type": "markdown", + "id": "ea339f92", + "metadata": {}, + "source": [ + "#### 1. TPR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a4cb374", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate yearly TPR to be added on top of the monthly TPR plots\n", + "monthly_cases_yearly <- monthly_cases %>%\n", + " group_by(ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR) %>% \n", + " mutate(\n", + " CONF_yearly = sum(CONF, na.rm = TRUE),\n", + " TEST_yearly = sum(TEST, na.rm = TRUE)\n", + " ) %>%\n", + " ungroup() %>%\n", + " mutate(\n", + " TPR_yearly = ifelse(!is.na(CONF_yearly) & !is.na(TEST_yearly) & (TEST_yearly != 0), CONF_yearly / TEST_yearly, 1)\n", + " ) \n", + "\n", + "head(monthly_cases_yearly)" + ] + }, + { + "cell_type": "markdown", + "id": "93ddc4fd", + "metadata": {}, + "source": [ + "##### 1.1. TPR (monthly) over time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d34d2d0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(monthly_cases_yearly) +\n", + "# Monthly TPR lines\n", + " geom_line(\n", + " aes(x = MONTH, y = TPR, group = ADM2_NAME),\n", + " color = \"grey21\",\n", + " alpha = 0.75) +\n", + " facet_grid(\n", + " cols = vars(YEAR), rows = vars(ADM1_NAME),\n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = seq(1,12,1)) +\n", + " scale_y_continuous(labels = scales::percent_format(accuracy = 1L), limits = c(0, 1)) +\n", + " geom_hline(\n", + " yintercept = 0,\n", + " color = \"grey21\",\n", + " linewidth = 0.5\n", + " ) +\n", + " labs(\n", + " title = \"Taux de Positivité des Tests (TPR) pour ADM2 et mois\" ) +\n", + " theme_minimal() +\n", + " theme(\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " strip.background = element_rect(fill = 
\"grey21\"),\n", + " strip.text = element_text(color = \"white\"),\n", + " axis.title.y = element_blank()\n", + " )\n", + "\n", + "ggsave(\n", + " file.path(FIGURES_PATH, glue::glue(\"TPR_monthly_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 21,\n", + " height = 29.7,\n", + " dpi = 200)" + ] + }, + { + "cell_type": "markdown", + "id": "cf87c6a4", + "metadata": {}, + "source": [ + "##### 1.2. TPR (monthly & yearly) over time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0004755f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "\n", + "# Add layer of yearly TPR on top (actually underneath) of monthly TPR\n", + "\n", + "ggplot(monthly_cases_yearly) +\n", + "# Yearly TPR lines\n", + " geom_line(\n", + " aes(x = MONTH, y = TPR_yearly, group = ADM2_NAME), \n", + " color = \"grey21\",\n", + " alpha = 0.25,\n", + " linewidth = 0.5) +\n", + "# Monthly TPR lines\n", + " geom_line(\n", + " aes(x = MONTH, y = TPR, group = ADM2_NAME),\n", + " color = \"grey21\",\n", + " alpha = 0.75) +\n", + " facet_grid(\n", + " cols = vars(YEAR), rows = vars(ADM1_NAME),\n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = seq(1,12,1)) +\n", + " scale_y_continuous(labels = scales::percent_format(accuracy = 1L), limits = c(0, 1)) +\n", + " geom_hline(\n", + " yintercept = 0,\n", + " color = \"grey21\",\n", + " linewidth = 0.5\n", + " ) +\n", + " labs(\n", + " title = \"Taux de Positivité des Tests (TPR) pour ADM2 at pour mois et année\",\n", + " subtitle = \"Les valeurs agrégées par année sont indiquées comme lignes horizontales.\") +\n", + " theme_minimal() +\n", + " theme(\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " strip.background = element_rect(fill = \"grey21\"),\n", + " strip.text = element_text(color = \"white\"),\n", + " 
axis.title.y = element_blank()\n", + " )\n", + "\n", + "ggsave(\n", + " file.path(FIGURES_PATH, glue::glue(\"TPR_monthly_yearly_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 21,\n", + " height = 29.7,\n", + " dpi = 200)" + ] + }, + { + "cell_type": "markdown", + "id": "577a047e", + "metadata": {}, + "source": [ + "#### 2. RR\n", + "For more detailas, check **report** notebooks for reporting rate of used method. Possible options:\n", + "* **Dataset**: pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/**snt_dhis2_reporting_rate_dataset_report**\\_OUTPUT\\_\\*.ipynb\n", + "* **DataElement**: work in progress ...\n", + "\n", + "⚠️⚠️⚠️ **TO DO**: align code here with report notebook pf reporting rate (use \"🎨 NEW dynamic colors & breaks\" approach) ⚠️⚠️⚠️" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "780802fd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Tile plot faceted by YEAR\n", + "ggplot(data = monthly_cases) +\n", + " geom_tile(aes(x = MONTH,\n", + " y = forcats::fct_rev(ADM2_NAME),\n", + " # fill = REPORTING_RATE_CATEGORY\n", + " fill = REPORTING_RATE\n", + " ), \n", + " color = \"white\",\n", + " show.legend = TRUE,\n", + " # Fill NA values with white\n", + " na.rm = FALSE\n", + " ) +\n", + "# scale_fill_manual(\n", + "# values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + "# na.value = \"white\",\n", + "# name = \"Reporting Rate: \"\n", + "# ) +\n", + " scale_fill_viridis_c(\n", + " option = \"viridis\",\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate:\",\n", + " direction = -1\n", + " # labels = scales::percent_format(accuracy = 1L)\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", + " scales = \"free_y\", space = \"free_y\",\n", + " switch = \"y\") +\n", + " theme_minimal() +\n", 
+ " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.key.height = unit(0.25, \"cm\"),\n", + " axis.text.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major = element_blank(),\n", + " strip.placement = \"outside\", \n", + " strip.text = element_text(color = \"white\", face = \"bold\", size = 10),\n", + " strip.background = element_rect(fill = \"grey21\")\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))\n", + "\n", + "# Export plot as png\n", + "ggsave(\n", + " file.path(FIGURES_PATH, glue::glue(\"ReportingRate_heatmap_monthly_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 21,\n", + " height = 29.7,\n", + " dpi = 200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "693ae7a4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check on data completeness for REPORTING RATE data: \n", + "# check how many values of REPORTING_RATE are NA\n", + "na_count <- sum(is.na(monthly_cases$REPORTING_RATE)) \n", + "if (na_count > 0) {\n", + " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", + "} else {\n", + " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0dfbbd7a", + "metadata": {}, + "source": [ + "### 3. Coherence checks on Incidence: Scatter plots\n", + "\n", + "Logic: each level of adjustment should produce values that are greater (or equal) to the previous level.
\n", + "\n", + "Namely:\n", + "* Crude <= Adj1\n", + "* Adj1 <= Adj2\n", + "* Adj2 <= Adj3\n", + "\n", + "Given that Crude, Adj1, Adj2, and Adj3 are calculated by aggregating `CONF`, `N1`, `N2`, and `N3` at ADM2 x YEAR, we can first verify that the relationship between these values is coherent. Namely, check if\n", + "* `CONF` <= `N1`\n", + "* `N1` <= `N2`\n", + "* `N2` <= `N3` " + ] + }, + { + "cell_type": "markdown", + "id": "26460827", + "metadata": {}, + "source": [ + "#### 3.1. Incidence \"metrics\"\n", + "Metrics used to calculate incidence: `CONF`, `N1`, `N2`, (and `N3`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82f9a64b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# CONF vs N1 \n", + "\n", + "# Create warning message if there are CONF values greater than N1\n", + "conf_greater_n1_count <- sum(monthly_cases$CONF > monthly_cases$N1, na.rm = TRUE)\n", + "if (conf_greater_n1_count > 0) {\n", + " warning_text <- glue(\"✘ Warning: There are {conf_greater_n1_count} instances where CONF is greater than N1.\")\n", + "} else {\n", + " warning_text <- \"✔ All CONF values are less than or equal to N1.\"\n", + "}\n", + "\n", + "ggplot(data = monthly_cases) +\n", + " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"red\") +\n", + " geom_point(\n", + " aes(\n", + " x = N1,\n", + " y = CONF),\n", + " alpha = 0.5) +\n", + " labs(\n", + " title = \"CONF vs N1\",\n", + " subtitle = \"N1 is expected to be greater or equal to CONF\",\n", + " caption = warning_text\n", + " ) +\n", + " theme_minimal() +\n", + " theme(\n", + " aspect.ratio = 1,\n", + " plot.caption.position = \"plot\",\n", + " plot.caption = element_text(hjust = 0)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7044d1d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# N1 > N2\n", + "\n", + "# Create warning message if
there are N1 values greater than N2\n", + "n1_greater_n2_count <- sum(monthly_cases$N1 > monthly_cases$N2, na.rm = TRUE)\n", + "if (n1_greater_n2_count > 0) {\n", + " warning_text <- glue(\"✘ Warning: There are {n1_greater_n2_count} instances where N1 is greater than N2.\")\n", + "} else {\n", + " warning_text <- \"✔ All N1 values are less than or equal to N2.\"\n", + "}\n", + "\n", + "ggplot(data = monthly_cases) +\n", + " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"red\") +\n", + " geom_point(\n", + " aes(\n", + " x = N2,\n", + " y = N1),\n", + " alpha = 0.5) +\n", + " labs(title = \"N1 vs N2\",\n", + " subtitle = \"N2 is expected to be greater or equal to N1.\",\n", + " caption = warning_text\n", + " ) +\n", + " theme_minimal() +\n", + " theme(\n", + " aspect.ratio = 1,\n", + " plot.caption.position = \"plot\",\n", + " plot.caption = element_text(hjust = 0)\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "00fc14f7", + "metadata": {}, + "source": [ + "#### 3.2.
Incidence values\n", + "Actual (calculated) incidence: Crude, Adj1, Adj2, Adj3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caff3b82", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add col to mark cases where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE so that it is displayed in red in the plot\n", + "yearly_incidence_plot <- yearly_incidence %>%\n", + " mutate(\n", + " FLAG_CRUDE_VS_ADJTEST = ifelse(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE),\n", + " FLAG_ADJTEST_VS_ADJREP = ifelse(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)\n", + " )\n", + "\n", + "if (\"INCIDENCE_ADJ_CARESEEKING\" %in% colnames(yearly_incidence) && any(!is.na(yearly_incidence$INCIDENCE_ADJ_CARESEEKING))) {\n", + " # Create col to flag cases where INCIDENCE_ADJ_TESTING > INCIDENCE_ADJ_CARESEEKING\n", + " yearly_incidence_plot <- yearly_incidence_plot %>%\n", + " mutate(\n", + " FLAG_ADJTEST_VS_ADJCARE = ifelse(INCIDENCE_ADJ_TESTING > INCIDENCE_ADJ_CARESEEKING, TRUE, FALSE)\n", + " )\n", + "}\n", + "\n", + "head(yearly_incidence_plot)" + ] + }, + { + "cell_type": "markdown", + "id": "78860883", + "metadata": {}, + "source": [ + "##### Crude vs Adj for Testing (Adj1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7847b5e7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create warning message if there are INCIDENCE_CRUDE values greater than INCIDENCE_ADJ_TESTING\n", + "incidence_crude_greater_adj1_count <- sum(yearly_incidence_plot$FLAG_CRUDE_VS_ADJTEST, na.rm = TRUE) \n", + "if (incidence_crude_greater_adj1_count > 0) {\n", + " warning_text <- glue(\"✘ Attention: il y a {incidence_crude_greater_adj1_count} instances où INCIDENCE_CRUDE est supérieure à INCIDENCE_ADJ_TESTING.\")\n", + "} else {\n", + " warning_text <- \"✔ Toutes les valeurs INCIDENCE_CRUDE sont inférieures ou égales à INCIDENCE_ADJ_TESTING.\"\n",
+ "}\n", + "\n", + "# Plot with points colored based on FLAG_CRUDE_VS_ADJTEST and faceted by YEAR\n", + "ggplot(data = yearly_incidence_plot) +\n", + " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"black\") +\n", + " geom_point(\n", + " aes(\n", + " x = INCIDENCE_CRUDE,\n", + " y = INCIDENCE_ADJ_TESTING,\n", + " color = FLAG_CRUDE_VS_ADJTEST),\n", + " alpha = 0.7,\n", + " size = 2) +\n", + " scale_color_manual(\n", + " values = c(\"FALSE\" = \"black\", \"TRUE\" = \"red\")\n", + " ) +\n", + " scale_x_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", + " scale_y_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", + " facet_wrap(vars(YEAR), nrow = 1) +\n", + " labs(\n", + " title = \"INCIDENCE_CRUDE vs INCIDENCE_ADJ_TESTING\",\n", + " subtitle = warning_text,\n", + " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", + " ) +\n", + " theme_minimal() +\n", + " theme(\n", + " aspect.ratio = 1,\n", + " legend.position = \"none\",\n", + " strip.text = element_text(face = \"bold\", size = 10),\n", + " panel.grid.minor = element_blank(),\n", + " plot.caption = element_text(size = 7, hjust = 0)\n", + " )\n", + "\n", + "# Export plots as png\n", + "ggsave(\n", + " file.path(FIGURES_PATH, glue::glue(\"Incidence_year_crude_vs_adj_testing_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 25,\n", + " height = 12.5,\n", + " dpi = 200)" + ] + }, + { + "cell_type": "markdown", + "id": "53a268f8", + "metadata": {}, + "source": [ + "##### Adj for Testing (Adj1) vs Adj for Reporting (Adj2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b25c459", + "metadata": { + "vscode": { +
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create warning message if there are INCIDENCE_ADJ_TESTING values greater than INCIDENCE_ADJ_REPORTING\n", + "incidence_adj1_greater_adj2_count <- sum(yearly_incidence_plot$FLAG_ADJTEST_VS_ADJREP, na.rm = TRUE) \n", + "if (incidence_adj1_greater_adj2_count > 0) {\n", + " warning_text <- glue(\"✘ Attention: il y a {incidence_adj1_greater_adj2_count} instances où INCIDENCE_ADJ_TESTING est supérieure à INCIDENCE_ADJ_REPORTING.\")\n", + "} else {\n", + " warning_text <- \"✔ Toutes les valeurs INCIDENCE_ADJ_TESTING sont inférieures ou égales à INCIDENCE_ADJ_REPORTING.\"\n", + "}\n", + "\n", + "# Plot with points colored based on FLAG_ADJTEST_VS_ADJREP and faceted by YEAR\n", + "ggplot(data = yearly_incidence_plot) +\n", + " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"black\") +\n", + " geom_point(\n", + " aes(\n", + " x = INCIDENCE_ADJ_TESTING,\n", + " y = INCIDENCE_ADJ_REPORTING,\n", + " color = FLAG_ADJTEST_VS_ADJREP),\n", + " alpha = 0.7,\n", + " size = 2) +\n", + " scale_color_manual(\n", + " values = c(\"FALSE\" = \"black\", \"TRUE\" = \"red\")\n", + " ) +\n", + " scale_x_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", + " scale_y_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", + " facet_wrap(vars(YEAR), nrow = 1) +\n", + " labs(\n", + " title = \"INCIDENCE_ADJ_TESTING vs INCIDENCE_ADJ_REPORTING\",\n", + " subtitle = warning_text,\n", + " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", + " ) +\n", + " theme_minimal() +\n", + " theme(\n", + " aspect.ratio = 1,\n", + " legend.position = \"none\",\n", + " strip.text = element_text(face = \"bold\", size = 10),\n", + " panel.grid.minor =
element_blank(),\n", + " plot.caption = element_text(size = 7, hjust = 0)\n", + " )\n", + "\n", + "# Export plots as png\n", + "ggsave(\n", + " file.path(FIGURES_PATH, glue::glue(\"Incidence_year_adj_testing_vs_adj_reporting_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 25,\n", + " height = 12.5,\n", + " dpi = 200)" + ] + }, + { + "cell_type": "markdown", + "id": "b7c80df9", + "metadata": {}, + "source": [ + "##### Adj for Reporting (Adj2) vs Adj for Care Seeking Behaviour (Adj3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35f9e871", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (\"INCIDENCE_ADJ_CARESEEKING\" %in% colnames(yearly_incidence) && any(!is.na(yearly_incidence$INCIDENCE_ADJ_CARESEEKING))) {\n", + "\n", + " # Create warning message if there are INCIDENCE_ADJ_TESTING values greater than INCIDENCE_ADJ_CARESEEKING\n", + " incidence_adj2_greater_adj3_count <- sum(yearly_incidence$INCIDENCE_ADJ_TESTING > yearly_incidence$INCIDENCE_ADJ_CARESEEKING, na.rm = TRUE) \n", + " if (incidence_adj2_greater_adj3_count > 0) {\n", + " warning_text <- glue(\"✘ Attention: il y a {incidence_adj2_greater_adj3_count} instances où INCIDENCE_ADJ_TESTING est supérieure à INCIDENCE_ADJ_CARESEEKING.\")\n", + " } else {\n", + " warning_text <- \"✔ Toutes les valeurs INCIDENCE_ADJ_TESTING sont inférieures ou égales à INCIDENCE_ADJ_CARESEEKING.\"\n", + " }\n", + "\n", + " # Plot with points colored based on FLAG_ADJTEST_VS_ADJCARE and faceted by YEAR\n", + " p <- ggplot(data = yearly_incidence_plot) +\n", + " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"black\") +\n", + " geom_point(\n", + " aes(\n", + " x = INCIDENCE_ADJ_TESTING,\n", + " y = INCIDENCE_ADJ_CARESEEKING,\n", + " color = FLAG_ADJTEST_VS_ADJCARE),\n", + " alpha = 0.7,\n", + " size = 2) +\n", + " scale_color_manual(\n", +
" values = c(\"FALSE\" = \"black\", \"TRUE\" = \"red\")\n", + " ) +\n", + " scale_x_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", + " scale_y_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", + " facet_wrap(vars(YEAR), nrow = 1) +\n", + " labs(\n", + " title = \"INCIDENCE_ADJ_TESTING vs INCIDENCE_ADJ_CARESEEKING\",\n", + " subtitle = warning_text,\n", + " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", + " ) +\n", + " theme_minimal() +\n", + " theme(\n", + " aspect.ratio = 1,\n", + " legend.position = \"none\",\n", + " strip.text = element_text(face = \"bold\", size = 10),\n", + " panel.grid.minor = element_blank(),\n", + " plot.caption = element_text(size = 7, hjust = 0)\n", + " )\n", + "\n", + " print(p)\n", + " \n", + " # Export plots as png\n", + " ggsave(\n", + " file.path(FIGURES_PATH, glue::glue(\"Incidence_year_adj_testing_vs_adj_careseeking_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 25,\n", + " height = 12.5,\n", + " dpi = 200)\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e828c4c0", + "metadata": {}, + "source": [ + "## Incidence du paludisme par année par district sanitaire" + ] + }, + { + "cell_type": "markdown", + "id": "a6ac4fbb", + "metadata": {}, + "source": [ + "#### Puor annee et niveau d'ajustement" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3effcced", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Prepare long-form data\n", + "incidence_long <- yearly_incidence %>% # incidence_data\n", + " select(ADM2_ID, YEAR, POPULATION,\n", + " INCIDENCE_CRUDE,\n", + " 
INCIDENCE_ADJ_TESTING,\n", + " INCIDENCE_ADJ_REPORTING,\n", + " INCIDENCE_ADJ_CARESEEKING) %>%\n", + " pivot_longer(\n", + " cols = starts_with(\"INCIDENCE\"),\n", + " names_to = \"INCIDENCE_TYPE\",\n", + " values_to = \"incidence\"\n", + " ) %>%\n", + " mutate(\n", + " incidence_type_label = case_when(\n", + " INCIDENCE_TYPE == \"INCIDENCE_CRUDE\" ~ \"Brute\",\n", + " INCIDENCE_TYPE == \"INCIDENCE_ADJ_TESTING\" ~ \"Ajustée 1\\n(Test)\",\n", + " INCIDENCE_TYPE == \"INCIDENCE_ADJ_REPORTING\" ~ \"Ajustée 2\\n(Test + Complétude)\",\n", + " INCIDENCE_TYPE == \"INCIDENCE_ADJ_CARESEEKING\" ~ \"Ajustée 3\\n(Test + Complétude + Soins)\",\n", + " TRUE ~ INCIDENCE_TYPE\n", + " )\n", + " )\n", + "\n", + "# Reorder incidence_type_label for plotting\n", + "incidence_long$incidence_type_label <- factor(\n", + "incidence_long$incidence_type_label,\n", + "levels = c(\"Brute\", \"Ajustée 1\\n(Test)\", \"Ajustée 2\\n(Test + Complétude)\", \"Ajustée 3\\n(Test + Complétude + Soins)\")\n", + ")\n", + "# # Remove INCIDENCE_ADJ_CARESEEKING if this is all empty ...\n", + "# filter(!is.na(incidence))\n", + "\n", + "\n", + "# Step 2: Join with shapefile\n", + "map_data_long <- shapes_data %>%\n", + " left_join(incidence_long, by = \"ADM2_ID\")\n", + "\n", + "\n", + "# Step 3: categorize incidence based on break values from metadata\n", + "map_data_long <- map_data_long %>%\n", + " mutate(\n", + " INCIDENCE_CATEGORY = cut(\n", + " incidence,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c3619af", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "options(repr.plot.width = 20, repr.plot.height = 12)\n", + "\n", + "# Dynamically define subtitle text (handle `is.null(DISAGGREGATION_SELECTION)` so it disaplys TOTAL instead)\n", + "if 
(is.null(DISAGGREGATION_SELECTION)) { \n", + " subtitle_text <- \"Brute et ajustée selon les étapes OMS.\\nAucune désagrégation spécifique sélectionnée.\"\n", + "} else {\n", + " subtitle_text <- glue::glue(\"Brute et ajustée selon les étapes OMS.\\nDésagrégation utilisée: {DISAGGREGATION_SELECTION}.\")\n", + "}\n", + "\n", + "# Plot maps faceted by incidence type and year\n", + "ggplot(map_data_long) +\n", + " geom_sf(aes(fill = INCIDENCE_CATEGORY), color = \"white\", size = 0.2) +\n", + " facet_grid(\n", + " rows = vars(incidence_type_label),\n", + " cols = vars(YEAR)\n", + " ) +\n", + " scale_fill_manual(values = palette_to_use, name = \"Incidence (pour 1000)\") +\n", + " labs(\n", + " title = \"Incidence annuelle du paludisme par district sanitaire\",\n", + " subtitle = subtitle_text,\n", + " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " strip.text = element_text(face = \"bold\", size = 12),\n", + " plot.title = element_text(face = \"bold\", size = 16),\n", + " plot.subtitle = element_text(size = 13),\n", + " plot.caption = element_text(size = 7, hjust = 0),\n", + " legend.position = \"right\",\n", + " legend.justification = \"top\",\n", + " panel.grid.major = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " axis.text = element_blank(),\n", + " axis.ticks = element_blank(),\n", + " )\n", + "\n", + "ggsave(\n", + " file.path(FIGURES_PATH, \n", + " glue::glue(\"Incidence_faceted_year_adjustment_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", + " create.dir = TRUE,\n", + " units = \"cm\",\n", + " width = 31,\n", + " height = 31,\n", + " dpi = 200\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "902b126a", + 
"metadata": {}, + "source": [ + "#### Moyenne annuelle (toutes années confondues)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6acf093", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Summarize incidence_long by computing mean incidence per INCIDENCE_TYPE and ADM2_ID across all years\n", + "incidence_long_mean <- incidence_long %>% \n", + "select(-POPULATION) %>%\n", + "# Added 20260128\n", + "group_by(ADM2_ID, INCIDENCE_TYPE,\tincidence_type_label) |>\n", + "summarise(\n", + " across(starts_with(\"INCIDENCE\"), ~mean(., na.rm = TRUE)), # 🔍 pox PROBLEM here: if missing data for RR -> sum of N2 by YEAR is smaller than the sum of N1 !\n", + " .groups = \"drop\"\n", + " ) \n", + "\n", + "# Step 2: Join with shapefile\n", + "map_data_long_mean <- shapes_data %>%\n", + " left_join(incidence_long_mean, by = \"ADM2_ID\")\n", + "\n", + "\n", + "# Step 3: categorize incidence based on break values from metadata\n", + "map_data_long_mean <- map_data_long_mean %>%\n", + " mutate(\n", + " INCIDENCE_CATEGORY = cut(\n", + " incidence,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )\n", + "\n", + "# head(map_data_long_mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e9ddfcc", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "subtitle_text_mean <- if (is.null(DISAGGREGATION_SELECTION)) {\n", + " \"Moyenne annuelle (toutes années confondues).\\nAucune désagrégation spécifique sélectionnée.\"\n", + "} else {\n", + " glue::glue(\"Moyenne annuelle (toutes années confondues).\\nDésagrégation utilisée: {DISAGGREGATION_SELECTION}.\")\n", + "}\n", + "\n", + "# Plot maps faceted by incidence type\n", + "ggplot(map_data_long_mean) +\n", + " geom_sf(aes(fill = INCIDENCE_CATEGORY), color = \"white\", size = 0.2) +\n", + " 
facet_wrap(\n", + " ~incidence_type_label,\n", + " nrow = 1\n", + " ) +\n", + " scale_fill_manual(values = palette_to_use, name = \"Incidence (pour 1000)\") +\n", + " labs(\n", + " title = \"Incidence moyenne du paludisme par district sanitaire\",\n", + " subtitle = subtitle_text_mean,\n", + " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " strip.text = element_text(face = \"bold\", size = 12),\n", + " plot.title = element_text(face = \"bold\", size = 16),\n", + " plot.subtitle = element_text(size = 13),\n", + " plot.caption = element_text(size = 7, hjust = 0),\n", + " legend.position = \"right\",\n", + " legend.justification = \"top\",\n", + " panel.grid.major = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " axis.text = element_blank(),\n", + " axis.ticks = element_blank(),\n", + " )\n", + "\n", + "\n", + "# Export plots as png\n", + "YEAR_RANGE <- paste0(min(yearly_incidence$YEAR), \"-\", max(yearly_incidence$YEAR))\n", + "ggsave(\n", + " file.path(FIGURES_PATH, \n", + " glue::glue(\"Incidence_faceted_adjustment_{DISAGGREGATION_SELECTION_SUFFIX}_mean-{YEAR_RANGE}.png\")),\n", + " create.dir = TRUE,\n", + " bg = \"white\",\n", + " units = \"cm\",\n", + " width = 41,\n", + " height = 21,\n", + " dpi = 200\n", + ")" + ] } - }, - "outputs": [], - "source": [ - "# Safety code to avoid breaking if nothings is fund in json_metadata\n", - "if (!exists(\"break_vals\") || is.null(break_vals) || length(break_vals) == 0) {\n", - " log_msg(\"[WARNING] No break values found in SNT_metadata.json for INCIDENCE_CRUDE$SCALE. 
Using default values.\", \"warning\")\n", - " break_vals <- c(100, 250, 450, 1000)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03806bba", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create the full set of cut points (0 to Infinity)\n", - "full_breaks <- c(0, break_vals, Inf)\n", - "\n", - "# Create dynamic labels\n", - "labels <- c(\n", - " paste0(\"< \", break_vals[1]), # First label\n", - " paste0(break_vals[-length(break_vals)], \"-\", break_vals[-1]), # Middle labels\n", - " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", - ")\n", - "\n", - "# Check\n", - "labels" - ] - }, - { - "cell_type": "markdown", - "id": "a8f168e2", - "metadata": {}, - "source": [ - "##### 3. Pick appropriate palette" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c2d0136", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count nr of breaks\n", - "nr_of_colors <- length(labels)\n", - "\n", - "# nr_of_colors\n", - "palette_to_use <- get_range_from_count(nr_of_colors)\n", - "\n", - "# # Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", - "# # Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correcpond to lowest value)\n", - "# names(palette_to_use) <- rev(labels)\n", - "\n", - "print(palette_to_use)\n" - ] - }, - { - "cell_type": "markdown", - "id": "d765607f", - "metadata": {}, - "source": [ - "#### Define plot size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f9c3af7", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "options(repr.plot.width = 20, repr.plot.height = 12)" - ] - }, - { - "cell_type": "markdown", - "id": "1768642c-297e-4a9f-939d-4460c047761b", - "metadata": {}, - "source": [ - "## Coherence checks\n", - "\n", - "See 
Jira: https://bluesquare.atlassian.net/browse/SNT25-272" - ] - }, - { - "cell_type": "markdown", - "id": "ea339f92", - "metadata": {}, - "source": [ - "#### 1. TPR" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a4cb374", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate yearly TPR to be added on top of the monthly TPR plots\n", - "monthly_cases_yearly <- monthly_cases %>%\n", - " group_by(ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR) %>% \n", - " mutate(\n", - " CONF_yearly = sum(CONF, na.rm = TRUE),\n", - " TEST_yearly = sum(TEST, na.rm = TRUE)\n", - " ) %>%\n", - " ungroup() %>%\n", - " mutate(\n", - " TPR_yearly = ifelse(!is.na(CONF_yearly) & !is.na(TEST_yearly) & (TEST_yearly != 0), CONF_yearly / TEST_yearly, 1)\n", - " ) \n", - "\n", - "head(monthly_cases_yearly)" - ] - }, - { - "cell_type": "markdown", - "id": "93ddc4fd", - "metadata": {}, - "source": [ - "##### 1.1. TPR (monthly) over time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d34d2d0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ggplot(monthly_cases_yearly) +\n", - "# Monthly TPR lines\n", - " geom_line(\n", - " aes(x = MONTH, y = TPR, group = ADM2_NAME),\n", - " color = \"grey21\",\n", - " alpha = 0.75) +\n", - " facet_grid(\n", - " cols = vars(YEAR), rows = vars(ADM1_NAME),\n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = seq(1,12,1)) +\n", - " scale_y_continuous(labels = scales::percent_format(accuracy = 1L), limits = c(0, 1)) +\n", - " geom_hline(\n", - " yintercept = 0,\n", - " color = \"grey21\",\n", - " linewidth = 0.5\n", - " ) +\n", - " labs(\n", - " title = \"Taux de Positivité des Tests (TPR) pour ADM2 et mois\" ) +\n", - " theme_minimal() +\n", - " theme(\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " strip.background = element_rect(fill = 
\"grey21\"),\n", - " strip.text = element_text(color = \"white\"),\n", - " axis.title.y = element_blank()\n", - " )\n", - "\n", - "ggsave(\n", - " file.path(FIGURES_PATH, glue::glue(\"TPR_monthly_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 21,\n", - " height = 29.7,\n", - " dpi = 200)" - ] - }, - { - "cell_type": "markdown", - "id": "cf87c6a4", - "metadata": {}, - "source": [ - "##### 1.2. TPR (monthly & yearly) over time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0004755f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "\n", - "# Add layer of yearly TPR on top (actually underneath) of monthly TPR\n", - "\n", - "ggplot(monthly_cases_yearly) +\n", - "# Yearly TPR lines\n", - " geom_line(\n", - " aes(x = MONTH, y = TPR_yearly, group = ADM2_NAME), \n", - " color = \"grey21\",\n", - " alpha = 0.25,\n", - " linewidth = 0.5) +\n", - "# Monthly TPR lines\n", - " geom_line(\n", - " aes(x = MONTH, y = TPR, group = ADM2_NAME),\n", - " color = \"grey21\",\n", - " alpha = 0.75) +\n", - " facet_grid(\n", - " cols = vars(YEAR), rows = vars(ADM1_NAME),\n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = seq(1,12,1)) +\n", - " scale_y_continuous(labels = scales::percent_format(accuracy = 1L), limits = c(0, 1)) +\n", - " geom_hline(\n", - " yintercept = 0,\n", - " color = \"grey21\",\n", - " linewidth = 0.5\n", - " ) +\n", - " labs(\n", - " title = \"Taux de Positivité des Tests (TPR) pour ADM2 at pour mois et année\",\n", - " subtitle = \"Les valeurs agrégées par année sont indiquées comme lignes horizontales.\") +\n", - " theme_minimal() +\n", - " theme(\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " strip.background = element_rect(fill = \"grey21\"),\n", - " strip.text = element_text(color = \"white\"),\n", - " 
axis.title.y = element_blank()\n", - " )\n", - "\n", - "ggsave(\n", - " file.path(FIGURES_PATH, glue::glue(\"TPR_monthly_yearly_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 21,\n", - " height = 29.7,\n", - " dpi = 200)" - ] - }, - { - "cell_type": "markdown", - "id": "577a047e", - "metadata": {}, - "source": [ - "#### 2. RR\n", - "For more detailas, check **report** notebooks for reporting rate of used method. Possible options:\n", - "* **Dataset**: pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/**snt_dhis2_reporting_rate_dataset_report**\\_OUTPUT\\_\\*.ipynb\n", - "* **DataElement**: work in progress ...\n", - "\n", - "⚠️⚠️⚠️ **TO DO**: align code here with report notebook pf reporting rate (use \"🎨 NEW dynamic colors & breaks\" approach) ⚠️⚠️⚠️" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "780802fd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Tile plot faceted by YEAR\n", - "ggplot(data = monthly_cases) +\n", - " geom_tile(aes(x = MONTH,\n", - " y = forcats::fct_rev(ADM2_NAME),\n", - " # fill = REPORTING_RATE_CATEGORY\n", - " fill = REPORTING_RATE\n", - " ), \n", - " color = \"white\",\n", - " show.legend = TRUE,\n", - " # Fill NA values with white\n", - " na.rm = FALSE\n", - " ) +\n", - "# scale_fill_manual(\n", - "# values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - "# na.value = \"white\",\n", - "# name = \"Reporting Rate: \"\n", - "# ) +\n", - " scale_fill_viridis_c(\n", - " option = \"viridis\",\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate:\",\n", - " direction = -1\n", - " # labels = scales::percent_format(accuracy = 1L)\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", - " scales = \"free_y\", space = \"free_y\",\n", - " switch = \"y\") +\n", - " theme_minimal() +\n", 
- " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.key.height = unit(0.25, \"cm\"),\n", - " axis.text.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major = element_blank(),\n", - " strip.placement = \"outside\", \n", - " strip.text = element_text(color = \"white\", face = \"bold\", size = 10),\n", - " strip.background = element_rect(fill = \"grey21\")\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))\n", - "\n", - "# Export plot as png\n", - "ggsave(\n", - " file.path(FIGURES_PATH, glue::glue(\"ReportingRate_heatmap_monthly_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 21,\n", - " height = 29.7,\n", - " dpi = 200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "693ae7a4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check on data completeness for REPORTING RATE data: \n", - "# check how many values of REPORTING_RATE are NA\n", - "na_count <- sum(is.na(monthly_cases$REPORTING_RATE)) \n", - "if (na_count > 0) {\n", - " log_msg(glue(\"⚠️ Warning: Reporting Rate data contains {na_count} missing values (NA) in 'REPORTING_RATE' column.\"), \"warning\")\n", - "} else {\n", - " log_msg(\"✅ Reporting Rate data contains no missing values (NA) in 'REPORTING_RATE' column.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "0dfbbd7a", - "metadata": {}, - "source": [ - "### 3. Coherence checks on Incidence: Scatter plots\n", - "\n", - "Logic: each level of adjustment should produce values that are greater (or equal) to the previous level.
\n", - "\n", - "Namely:\n", - "* Crude <= Adj1\n", - "* Adj1 <= Adj2\n", - "* Adj2 <= Adj3\n", - "\n", - "Given than Crude, Adj1, Adj2, and Adj3 are calculated by aggregating `CONF`, `N1`, `N2`, and `N3` at ADM2 x YEAR, we can first verify that the relationship between these values is coherent. Namely, check if\n", - "* `CONF` <= `N1`\n", - "* `N1` <= `N2`\n", - "* `N2` <= `N3` " - ] - }, - { - "cell_type": "markdown", - "id": "26460827", - "metadata": {}, - "source": [ - "#### 3.1. Incidence \"metrics\"\n", - "Metrics used to calculate incidence: `CONF`, `N1`, `N2`, (and `N3`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82f9a64b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# CONF vs N1 \n", - "\n", - "# Create warning message if there are CONF values greater than N1\n", - "conf_greater_n1_count <- sum(monthly_cases$CONF > monthly_cases$N1, na.rm = TRUE)\n", - "if (conf_greater_n1_count > 0) {\n", - " warning_text <- glue(\"✘ Warning: There are {conf_greater_n1_count} instances where CONF is greater than N1.\", \"warning\")\n", - "} else {\n", - " warning_text <- \"✔ All CONF values are less than or equal to N1.\"\n", - "}\n", - "\n", - "ggplot(data = monthly_cases) +\n", - " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"red\") +\n", - " geom_point(\n", - " aes(\n", - " x = N1,\n", - " y = CONF),\n", - " alpha = 0.5) +\n", - " labs(\n", - " title = \"CONF vs N1\",\n", - " subtitle = \"N1 is expected to be greater or equal to CONF\",\n", - " caption = warning_text\n", - " ) +\n", - " theme_minimal() +\n", - " theme(\n", - " aspect.ratio = 1,\n", - " plot.caption.position = \"plot\",\n", - " plot.caption = element_text(hjust = 0)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7044d1d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# N1 > N2\n", - "\n", - "# Create warning message if 
there are N1 values greater than N2\n", - "n1_greater_n2_count <- sum(monthly_cases$N1 > monthly_cases$N2, na.rm = TRUE)\n", - "if (n1_greater_n2_count > 0) {\n", - " warning_text <- glue(\"✘ Warning: There are {n1_greater_n2_count} instances where N1 is greater than N2.\", \"warning\")\n", - "} else {\n", - " warning_text <- \"✔ All N1 values are less than or equal to N2.\"\n", - "}\n", - "\n", - "ggplot(data = monthly_cases) +\n", - " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"red\") +\n", - " geom_point(\n", - " aes(\n", - " x = N2,\n", - " y = N1),\n", - " alpha = 0.5) +\n", - " labs(title = \"N1 vs N2\",\n", - " subtitle = \"N2 is expected to be greater or equal to N1.\",\n", - " caption = warning_text\n", - " ) +\n", - " theme_minimal() +\n", - " theme(\n", - " aspect.ratio = 1,\n", - " plot.caption.position = \"plot\",\n", - " plot.caption = element_text(hjust = 0)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "00fc14f7", - "metadata": {}, - "source": [ - "#### 3.2. 
Incidence values\n", - "Actual (calculated) incidence: Crude, Adj1, Adj2, Adj3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "caff3b82", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add col to mark cases where INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE so that it is displayed in red in the plot\n", - "yearly_incidence_plot <- yearly_incidence %>%\n", - " mutate(\n", - " FLAG_CRUDE_VS_ADJTEST = ifelse(INCIDENCE_ADJ_TESTING < INCIDENCE_CRUDE, TRUE, FALSE),\n", - " FLAG_ADJTEST_VS_ADJREP = ifelse(INCIDENCE_ADJ_REPORTING < INCIDENCE_ADJ_TESTING, TRUE, FALSE)\n", - " )\n", - "\n", - "if (\"INCIDENCE_ADJ_CARESEEKING\" %in% colnames(yearly_incidence) && any(!is.na(yearly_incidence$INCIDENCE_ADJ_CARESEEKING))) {\n", - " # Create col to flag cases where INCIDENCE_ADJ_TESTING > INCIDENCE_ADJ_CARESEEKING\n", - " yearly_incidence_plot <- yearly_incidence_plot %>%\n", - " mutate(\n", - " FLAG_ADJTEST_VS_ADJCARE = ifelse(INCIDENCE_ADJ_TESTING > INCIDENCE_ADJ_CARESEEKING, TRUE, FALSE)\n", - " )\n", - "}\n", - "\n", - "head(yearly_incidence_plot)" - ] - }, - { - "cell_type": "markdown", - "id": "78860883", - "metadata": {}, - "source": [ - "##### Crude vs Adj for Testing (Adj1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7847b5e7", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create warning message if there are INCIDENCE_CRUDE values greater than INCIDENCE_ADJ_TESTING\n", - "incidence_crude_greater_adj1_count <- sum(yearly_incidence_plot$FLAG_CRUDE_VS_ADJTEST, na.rm = TRUE) \n", - "if (incidence_crude_greater_adj1_count > 0) {\n", - " warning_text <- glue(\"✘ Attention: il y a {incidence_crude_greater_adj1_count} instances où INCIDENCE_CRUDE est supérieure à INCIDENCE_ADJ_TESTING.\", \"warning\")\n", - "} else {\n", - " warning_text <- \"✔ Toutes les valeurs INCIDENCE_CRUDE sont inférieures ou égales à INCIDENCE_ADJ_TESTING.\"\n", 
- "}\n", - "\n", - "# Plot with points colored based on FLAG_CRUDE_VS_ADJTEST and faceted by YEAR\n", - "ggplot(data = yearly_incidence_plot) +\n", - " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"black\") +\n", - " geom_point(\n", - " aes(\n", - " x = INCIDENCE_CRUDE,\n", - " y = INCIDENCE_ADJ_TESTING,\n", - " color = FLAG_CRUDE_VS_ADJTEST),\n", - " alpha = 0.7,\n", - " size = 2) +\n", - " scale_color_manual(\n", - " values = c(\"FALSE\" = \"black\", \"TRUE\" = \"red\")\n", - " ) +\n", - " scale_x_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", - " scale_y_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", - " facet_wrap(vars(YEAR), nrow = 1) +\n", - " labs(\n", - " title = \"INCIDENCE_CRUDE vs INCIDENCE_ADJ_TESTING\",\n", - " subtitle = warning_text,\n", - " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nMéthode de détection des valeurs aberrantes: {OUTLIER_DETECTION_METHOD}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", - " ) +\n", - " theme_minimal() +\n", - " theme(\n", - " aspect.ratio = 1,\n", - " legend.position = \"none\",\n", - " strip.text = element_text(face = \"bold\", size = 10),\n", - " panel.grid.minor = element_blank(),\n", - " plot.caption = element_text(size = 7, hjust = 0)\n", - " )\n", - "\n", - "# Export plots as png\n", - "ggsave(\n", - " file.path(FIGURES_PATH, glue::glue(\"Incidence_year_crude_vs_adj_testing_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 25,\n", - " height = 12.5,\n", - " dpi = 200)" - ] - }, - { - "cell_type": "markdown", - "id": "53a268f8", - "metadata": {}, - "source": [ - "##### Adj for Testing (Adj1) vs Adj for Reporting (Adj2)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "7b25c459", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create warning message if there are INCIDENCE_ADJ_TESTING values greater than INCIDENCE_ADJ_REPORTING\n", - "incidence_adj1_greater_adj2_count <- sum(yearly_incidence_plot$FLAG_ADJTEST_VS_ADJREP, na.rm = TRUE) \n", - "if (incidence_adj1_greater_adj2_count > 0) {\n", - " warning_text <- glue(\"✘ Attention: il y a {incidence_adj1_greater_adj2_count} instances où INCIDENCE_ADJ_TESTING est supérieure à INCIDENCE_ADJ_REPORTING.\", \"warning\")\n", - "} else {\n", - " warning_text <- \"✔ Toutes les valeurs INCIDENCE_ADJ_TESTING sont inférieures ou égales à INCIDENCE_ADJ_REPORTING.\"\n", - "}\n", - "\n", - "# Plot with points colored based on FLAG_ADJTEST_VS_ADJREP and faceted by YEAR\n", - "ggplot(data = yearly_incidence_plot) +\n", - " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"black\") +\n", - " geom_point(\n", - " aes(\n", - " x = INCIDENCE_ADJ_TESTING,\n", - " y = INCIDENCE_ADJ_REPORTING,\n", - " color = FLAG_ADJTEST_VS_ADJREP),\n", - " alpha = 0.7,\n", - " size = 2) +\n", - " scale_color_manual(\n", - " values = c(\"FALSE\" = \"black\", \"TRUE\" = \"red\")\n", - " ) +\n", - " scale_x_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", - " scale_y_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", - " facet_wrap(vars(YEAR), nrow = 1) +\n", - " labs(\n", - " title = \"INCIDENCE_ADJ_TESTING vs INCIDENCE_ADJ_REPORTING\",\n", - " subtitle = warning_text,\n", - " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nMéthode de détection des valeurs aberrantes: {OUTLIER_DETECTION_METHOD}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", - " ) +\n", - " theme_minimal() +\n", - " theme(\n", 
- " aspect.ratio = 1,\n", - " legend.position = \"none\",\n", - " strip.text = element_text(face = \"bold\", size = 10),\n", - " panel.grid.minor = element_blank(),\n", - " plot.caption = element_text(size = 7, hjust = 0)\n", - " )\n", - "\n", - "# Export plots as png\n", - "ggsave(\n", - " file.path(FIGURES_PATH, glue::glue(\"Incidence_year_adj_testing_vs_adj_reporting_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 25,\n", - " height = 12.5,\n", - " dpi = 200)" - ] - }, - { - "cell_type": "markdown", - "id": "b7c80df9", - "metadata": {}, - "source": [ - "##### Adj for Reporting (Adj2) vs Adj for Care Seeking Behaviour (Adj3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35f9e871", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"INCIDENCE_ADJ_CARESEEKING\" %in% colnames(yearly_incidence) && any(!is.na(yearly_incidence$INCIDENCE_ADJ_CARESEEKING))) {\n", - "\n", - " # Create warning message if there are INCIDENCE_ADJ_TESTING values greater than INCIDENCE_ADJ_CARESEEKING\n", - " incidence_adj2_greater_adj3_count <- sum(yearly_incidence$INCIDENCE_ADJ_TESTING > yearly_incidence$INCIDENCE_ADJ_CARESEEKING, na.rm = TRUE) \n", - " if (incidence_adj2_greater_adj3_count > 0) {\n", - " warning_text <- glue(\"✘ Attention: il y a {incidence_adj2_greater_adj3_count} instances où INCIDENCE_ADJ_TESTING est supérieure à INCIDENCE_ADJ_CARESEEKING.\", \"warning\")\n", - " } else {\n", - " warning_text <- \"✔ Toutes les valeurs INCIDENCE_ADJ_TESTING sont inférieures ou égales à INCIDENCE_ADJ_CARESEEKING.\"\n", - " }\n", - "\n", - " # Plot with points colored based on FLAG_ADJTEST_VS_ADJREP and faceted by YEAR\n", - " p <- ggplot(data = yearly_incidence_plot) +\n", - " geom_abline(intercept = 0, slope = 1, linetype = \"dashed\", color = \"black\") +\n", - " geom_point(\n", - " aes(\n", - " x = INCIDENCE_ADJ_TESTING,\n", - " 
y = INCIDENCE_ADJ_CARESEEKING,\n", - " color = FLAG_ADJTEST_VS_ADJCARE),\n", - " alpha = 0.7,\n", - " size = 2) +\n", - " scale_color_manual(\n", - " values = c(\"FALSE\" = \"black\", \"TRUE\" = \"red\")\n", - " ) +\n", - " scale_x_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", - " scale_y_continuous(limits = c(0, NA), breaks = c(0, break_vals)) +\n", - " facet_wrap(vars(YEAR), nrow = 1) +\n", - " labs(\n", - " title = \"INCIDENCE_ADJ_TESTING vs INCIDENCE_ADJ_CARESEEKING\",\n", - " subtitle = warning_text,\n", - " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nMéthode de détection des valeurs aberrantes: {OUTLIER_DETECTION_METHOD}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", - " ) +\n", - " theme_minimal() +\n", - " theme(\n", - " aspect.ratio = 1,\n", - " legend.position = \"none\",\n", - " strip.text = element_text(face = \"bold\", size = 10),\n", - " panel.grid.minor = element_blank(),\n", - " plot.caption = element_text(size = 7, hjust = 0)\n", - " )\n", - "\n", - " print(p)\n", - " \n", - " # Export plots as png\n", - " ggsave(\n", - " file.path(FIGURES_PATH, glue::glue(\"Incidence_year_adj_testing_vs_adj_careseeking_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 25,\n", - " height = 12.5,\n", - " dpi = 200)\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e828c4c0", - "metadata": {}, - "source": [ - "## Incidence du paludisme par année par district sanitaire" - ] - }, - { - "cell_type": "markdown", - "id": "a6ac4fbb", - "metadata": {}, - "source": [ - "#### Puor annee et niveau d'ajustement" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3effcced", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - 
"outputs": [], - "source": [ - "# Step 1: Prepare long-form data\n", - "incidence_long <- yearly_incidence %>% # incidence_data\n", - " select(ADM2_ID, YEAR, POPULATION,\n", - " INCIDENCE_CRUDE,\n", - " INCIDENCE_ADJ_TESTING,\n", - " INCIDENCE_ADJ_REPORTING,\n", - " INCIDENCE_ADJ_CARESEEKING) %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"INCIDENCE\"),\n", - " names_to = \"INCIDENCE_TYPE\",\n", - " values_to = \"incidence\"\n", - " ) %>%\n", - " mutate(\n", - " incidence_type_label = case_when(\n", - " INCIDENCE_TYPE == \"INCIDENCE_CRUDE\" ~ \"Brute\",\n", - " INCIDENCE_TYPE == \"INCIDENCE_ADJ_TESTING\" ~ \"Ajustée 1\\n(Test)\",\n", - " INCIDENCE_TYPE == \"INCIDENCE_ADJ_REPORTING\" ~ \"Ajustée 2\\n(Test + Complétude)\",\n", - " INCIDENCE_TYPE == \"INCIDENCE_ADJ_CARESEEKING\" ~ \"Ajustée 3\\n(Test + Complétude + Soins)\",\n", - " TRUE ~ INCIDENCE_TYPE\n", - " )\n", - " )\n", - "\n", - "# Reorder incidence_type_label for plotting\n", - "incidence_long$incidence_type_label <- factor(\n", - "incidence_long$incidence_type_label,\n", - "levels = c(\"Brute\", \"Ajustée 1\\n(Test)\", \"Ajustée 2\\n(Test + Complétude)\", \"Ajustée 3\\n(Test + Complétude + Soins)\")\n", - ")\n", - "# # Remove INCIDENCE_ADJ_CARESEEKING if this is all empty ...\n", - "# filter(!is.na(incidence))\n", - "\n", - "\n", - "# Step 2: Join with shapefile\n", - "map_data_long <- shapes_data %>%\n", - " left_join(incidence_long, by = \"ADM2_ID\")\n", - "\n", - "\n", - "# Step 3: categorize incidence based on break values from metadata\n", - "map_data_long <- map_data_long %>%\n", - " mutate(\n", - " INCIDENCE_CATEGORY = cut(\n", - " incidence,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c3619af", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - 
"options(repr.plot.width = 20, repr.plot.height = 12)\n", - "\n", - "# Dynamically define subtitle text (handle `is.null(DISAGGREGATION_SELECTION)` so it disaplys TOTAL instead)\n", - "if (is.null(DISAGGREGATION_SELECTION)) { \n", - " subtitle_text <- \"Brute et ajustée selon les étapes OMS.\\nAucune désagrégation spécifique sélectionnée.\"\n", - "} else {\n", - " subtitle_text <- glue::glue(\"Brute et ajustée selon les étapes OMS.\\nDésagrégation utilisée: {DISAGGREGATION_SELECTION}.\")\n", - "}\n", - "\n", - "# Plot maps faceted by incidence type and year\n", - "ggplot(map_data_long) +\n", - " geom_sf(aes(fill = INCIDENCE_CATEGORY), color = \"white\", size = 0.2) +\n", - " facet_grid(\n", - " rows = vars(incidence_type_label),\n", - " cols = vars(YEAR)\n", - " ) +\n", - " scale_fill_manual(values = palette_to_use, name = \"Incidence (pour 1000)\") +\n", - " labs(\n", - " title = \"Incidence annuelle du paludisme par district sanitaire\",\n", - " subtitle = subtitle_text,\n", - " # caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nDétection des valeurs aberrantes: {OUTLIER_DETECTION_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", - " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nMéthode de détection des valeurs aberrantes: {OUTLIER_DETECTION_METHOD}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " strip.text = element_text(face = \"bold\", size = 12),\n", - " plot.title = element_text(face = \"bold\", size = 16),\n", - " plot.subtitle = element_text(size = 13),\n", - " plot.caption = element_text(size = 7, hjust = 0),\n", - " legend.position = 
\"right\",\n", - " legend.justification = \"top\",\n", - " panel.grid.major = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " axis.text = element_blank(),\n", - " axis.ticks = element_blank(),\n", - " )\n", - "\n", - "ggsave(\n", - " file.path(FIGURES_PATH, \n", - " glue::glue(\"Incidence_faceted_year_adjustment_{DISAGGREGATION_SELECTION_SUFFIX}.png\")),\n", - " create.dir = TRUE,\n", - " units = \"cm\",\n", - " width = 31,\n", - " height = 31,\n", - " dpi = 200\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "902b126a", - "metadata": {}, - "source": [ - "#### Moyenne annuelle (toutes années confondues)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6acf093", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Summarize incidence_long by computing mean incidence per INCIDENCE_TYPE and ADM2_ID across all years\n", - "incidence_long_mean <- incidence_long %>% \n", - "select(-POPULATION) %>%\n", - "# Added 20260128\n", - "group_by(ADM2_ID, INCIDENCE_TYPE,\tincidence_type_label) |>\n", - "summarise(\n", - " across(starts_with(\"INCIDENCE\"), ~mean(., na.rm = TRUE)), # 🔍 pox PROBLEM here: if missing data for RR -> sum of N2 by YEAR is smaller than the sum of N1 !\n", - " .groups = \"drop\"\n", - " ) \n", - "\n", - "# Step 2: Join with shapefile\n", - "map_data_long_mean <- shapes_data %>%\n", - " left_join(incidence_long_mean, by = \"ADM2_ID\")\n", - "\n", - "\n", - "# Step 3: categorize incidence based on break values from metadata\n", - "map_data_long_mean <- map_data_long_mean %>%\n", - " mutate(\n", - " INCIDENCE_CATEGORY = cut(\n", - " incidence,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )\n", - "\n", - "# head(map_data_long_mean)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e9ddfcc", - "metadata": { - 
"vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "subtitle_text_mean <- if (is.null(DISAGGREGATION_SELECTION)) {\n", - " \"Moyenne annuelle (toutes années confondues).\\nAucune désagrégation spécifique sélectionnée.\"\n", - "} else {\n", - " glue::glue(\"Moyenne annuelle (toutes années confondues).\\nDésagrégation utilisée: {DISAGGREGATION_SELECTION}.\")\n", - "}\n", - "\n", - "# Plot maps faceted by incidence type\n", - "ggplot(map_data_long_mean) +\n", - " geom_sf(aes(fill = INCIDENCE_CATEGORY), color = \"white\", size = 0.2) +\n", - " facet_wrap(\n", - " ~incidence_type_label,\n", - " nrow = 1\n", - " ) +\n", - " scale_fill_manual(values = palette_to_use, name = \"Incidence (pour 1000)\") +\n", - " labs(\n", - " title = \"Incidence moyenne du paludisme par district sanitaire\",\n", - " subtitle = subtitle_text_mean,\n", - " caption = glue::glue(\"Méthode de calcul de N1: {N1_METHOD}.\\nUtilisation de la population ajustée: {USE_ADJUSTED_POPULATION}.\\nDonnées de routine: {ROUTINE_DATA_CHOICE}.\\nMéthode de détection des valeurs aberrantes: {OUTLIER_DETECTION_METHOD}.\\nTaux de déclaration calculé selon la méthode : {REPORTING_RATE_METHOD}.\\nUtilisation des données CSB: {USE_CSB_DATA}.\")\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " strip.text = element_text(face = \"bold\", size = 12),\n", - " plot.title = element_text(face = \"bold\", size = 16),\n", - " plot.subtitle = element_text(size = 13),\n", - " plot.caption = element_text(size = 7, hjust = 0),\n", - " legend.position = \"right\",\n", - " legend.justification = \"top\",\n", - " panel.grid.major = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " axis.text = 
element_blank(),\n", - " axis.ticks = element_blank(),\n", - " )\n", - "\n", - "\n", - "# Export plots as png\n", - "YEAR_RANGE <- paste0(min(yearly_incidence$YEAR), \"-\", max(yearly_incidence$YEAR))\n", - "ggsave(\n", - " file.path(FIGURES_PATH, \n", - " glue::glue(\"Incidence_faceted_adjustment_{DISAGGREGATION_SELECTION_SUFFIX}_mean-{YEAR_RANGE}.png\")),\n", - " create.dir = TRUE,\n", - " bg = \"white\",\n", - " units = \"cm\",\n", - " width = 41,\n", - " height = 21,\n", - " dpi = 200\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index a07e091..931e223 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1,1234 +1,1232 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the 
chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ], - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65" - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ], - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" - }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ], - "execution_count": null, - "outputs": [], - "id": "35ede7cf-257f-439c-a514-26a7290f881d" - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": 
"2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ], - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" - }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Current options: \n", - "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", - "# \"COUNTRY_CODE_routine_outliers-mean_removed.parquet\" \n", - "# \"COUNTRY_CODE_routine_outliers-mean_imputed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers-median_removed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers-median_imputed.parquet\" \n", - "# \"COUNTRY_CODE_routine_outliers-iqr_removed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers-iqr_imputed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers-trend_removed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers-trend_imputed.parquet\" \n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"NER_routine_outliers-mean_imputed.parquet\"}\n", - "\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ], - "execution_count": null, - "outputs": [], - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb" - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000095, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" - 
}, - "tags": [] - }, - "source": [ - "### 1.2. Load and check `snt config` file" - ], - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" - }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ], - "execution_count": null, - "outputs": [], - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" - }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", - "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", - "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", - "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", - "# See: 
https://bluesquare.atlassian.net/browse/SNT25-158\n", - "# The variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ], - "execution_count": null, - "outputs": [], - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ], - "id": "8bf4a8bb" - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ], - "id": "18b40207", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000093, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ], - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000069, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ], - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" + "cells": [ + { + "cell_type": "markdown", + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# log\n", - "log_msg(glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. 
Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" - ], - "execution_count": null, - "outputs": [], - "id": "a1213723-f7e2-4238-9f37-f1795b187232" + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ] + }, + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ], - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a" + "tags": [] + }, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load file from dataset\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", 
- " cat(msg)\n", - " stop(msg)\n", - "})\n", - " \n", - "msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset: `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - "log_msg(msg)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted,2)" - ], - "execution_count": null, - "outputs": [], - "id": "2fd92901-901e-4019-be78-a7718050c1c4" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "\n", + "# Load libraries \n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00011, + "end_time": "2026-01-16T10:23:56.165873", + "exception": false, + "start_time": "2026-01-16T10:23:56.165763", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": "2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.3. 
Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ], - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1" + "tags": [] + }, + "source": [ + "### 1.1. Fallback parameter values\n", + "These parameters are injected by papermill when running in OH via pipeline run interface.
\n", + "The code cell below here provides fallback paramater values needed when running this notebook locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "papermill": { + "duration": 0.033954, + "end_time": "2026-01-16T10:23:56.199937", + "exception": false, + "start_time": "2026-01-16T10:23:56.165983", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Current options:\n", + "# \"COUNTRY_CODE_routine.parquet\" (raw)\n", + "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", + "# \"COUNTRY_CODE_routine_outliers_removed.parquet\"\n", + "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"NER_routine_outliers_imputed.parquet\"}\n", + "\n", + "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", + "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", + "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" + ] + }, + { + "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 9.5e-05, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000091, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ], - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295" + "tags": [] + }, + "source": [ + "### 1.2. 
Load and check `snt config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "papermill": { + "duration": 0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ], - "id": "7d62cdb6" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" - ], - "execution_count": null, - "outputs": [], - "id": 
"3bc2e76a-b5c7-4c71-90f2-c66926ca560a" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", + "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", + "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", + "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", + "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", + "# See: https://bluesquare.atlassian.net/browse/SNT25-158\n", + "# The variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", + "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", + "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", + "\n", + "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", + "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", + "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] + }, + { + "cell_type": "markdown", + "id": "8bf4a8bb", + "metadata": {}, + "source": [ + "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", + "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", + "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b40207", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!length(ACTIVITY_INDICATORS) > 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 9.3e-05, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.1. Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ], - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559" + "tags": [] + }, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 6.9e-05, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", - "\n", - "facility_master <- dhis2_pyramid_formatted %>%\n", - " rename(\n", - " OU_ID = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\"),\n", - " OU_NAME = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME\"),\n", - " ADM2_ID = str_replace(ADMIN_2, \"NAME\", \"ID\"),\n", - " ADM2_NAME = all_of(ADMIN_2),\n", - " ADM1_ID = str_replace(ADMIN_1, \"NAME\", \"ID\"),\n", - " ADM1_NAME = all_of(ADMIN_1)\n", - " ) %>%\n", - " select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>%\n", - " distinct() %>%\n", - " tidyr::crossing(PERIOD = period_vector) %>%\n", - " mutate(PERIOD=as.numeric(PERIOD))\n", - " " - ], - "execution_count": null, - "outputs": [], - "id": "9308197a-0852-4d34-8888-cf5564f35a9d" + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.2. Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ], - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select dataset\n", + "if (ROUTINE_FILE == glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", + " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "} else {\n", + " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + "}\n", + " \n", + "# Load file from dataset\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", + " error = function(e) {\n", + " msg <- paste(\"[ERROR] Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", + "\n", + "# log\n", + 
"log_msg(glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Assessing facility reporting activity based on the following indicators: {paste(ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "facility_master_routine <- left_join(\n", - " facility_master,\n", - " # dhis2_routine %>% select(OU_ID, PERIOD, all_of(DHIS2_INDICATORS)), # GP 2026-02-04\n", - " dhis2_routine %>% select(OU_ID, PERIOD, any_of(DHIS2_INDICATORS)), \n", - " by = c(\"OU_ID\", \"PERIOD\")\n", - " ) %>%\n", - " mutate(\n", - " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", - " ACTIVE_THIS_PERIOD = ifelse(\n", - " rowSums(!is.na(across(all_of(ACTIVITY_INDICATORS))) & across(all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0), \n", - " COUNT = 1 # Counting every facility\n", - " )" - ], - "execution_count": null, - "outputs": [], - "id": "7b279d27" + "tags": [] + }, + "source": [ + "### 2.2. 
Organisation units (DHIS2 pyramid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fd92901-901e-4019-be78-a7718050c1c4", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.3. Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
- ], - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load file from dataset\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + " \n", + "msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset: `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", + "log_msg(msg)\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted,2)" + ] + }, + { + "cell_type": "markdown", + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "facility_master_routine <- facility_master_routine %>%\n", - " mutate(\n", - " period_date = as.Date(ym(PERIOD)),\n", - " \n", - " # Flag facilities explicitly marked as closed in their name\n", - " NAME_CLOSED = str_detect(\n", - " toupper(OU_NAME),\n", - " \"CLOTUR|FERM(E|EE)?\"\n", - " ),\n", - "\n", - " # Check whether the facility is open during the period using open/close dates\n", - " OPEN_BY_DATE = \n", - " !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", - " (!is.na(CLOSED_DATE) & 
as.Date(CLOSED_DATE) <= period_date)\n", - " ),\n", - " \n", - " # Final definition of an open facility for the period:\n", - " # not explicitly closed, within opening/closing dates,\n", - " # and started reporting\n", - " OPEN = ifelse(\n", - " !NAME_CLOSED & OPEN_BY_DATE,\n", - " 1, 0\n", - " )\n", - " )" - ], - "execution_count": null, - "outputs": [], - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89" + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ], - "id": "657fd6ca" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", + "}\n", + "\n", + "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. Process cannot continue.\")\n", + " cat(msg)\n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", + "metadata": { + "papermill": { + "duration": 9.1e-05, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ], - "id": "a598e4b7" + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ] + }, + { + "cell_type": "markdown", + "id": "7d62cdb6", + "metadata": {}, + "source": [ + "#### 3.0. Define start and end period based on routine data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Flag facilities with at least one report in the year\n", - "facility_master_routine_01 <- facility_master_routine %>%\n", - " group_by(OU_ID, YEAR) %>%\n", - " mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% # use max() to flag if ACTIVE_THIS_PERIOD is 1 at least once\n", - " ungroup()" - ], - "execution_count": null, - "outputs": [], - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", + "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", + "\n", + "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" }, - { - "cell_type": "markdown", - 
"metadata": { - "papermill": { - "duration": 0.000098, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ], - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d" + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9308197a-0852-4d34-8888-cf5564f35a9d", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Computing volume of activity using indicator: {paste(VOLUME_ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "# Compute MEAN_REPORTED_CASES_BY_HF as total cases over months with activity\n", - "mean_monthly_cases <- dhis2_routine %>% \n", - " mutate(total_cases_by_hf_month = rowSums(across(all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", - " group_by(ADM2_ID, OU_ID) %>% \n", - " summarise(\n", - " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", - " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", - " .groups = \"drop\"\n", - " ) %>% \n", - " mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", - " select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", - "\n", - "mean_monthly_cases_adm2 <- mean_monthly_cases %>% \n", - " select(ADM2_ID, 
MEAN_REPORTED_CASES_BY_HF) %>% \n", - " group_by(ADM2_ID) %>% \n", - " summarise(SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm=TRUE), \n", - " NR_OF_HF = n())\n", - "\n", - "# Compute weights\n", - "hf_weights <- mean_monthly_cases %>% \n", - " left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", - " mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n", - "\n", - "# Join with rest of data\n", - "facility_master_routine_02 <- facility_master_routine_01 %>%\n", - " left_join(hf_weights %>% select(OU_ID, WEIGHT), by = c(\"OU_ID\"))" - ], - "execution_count": null, - "outputs": [], - "id": "4420e559-4134-4fc3-8950-9972ebede00e" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", + "\n", + "facility_master <- dhis2_pyramid_formatted %>%\n", + " rename(\n", + " OU_ID = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\"),\n", + " OU_NAME = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME\"),\n", + " ADM2_ID = str_replace(ADMIN_2, \"NAME\", \"ID\"),\n", + " ADM2_NAME = all_of(ADMIN_2),\n", + " ADM1_ID = str_replace(ADMIN_1, \"NAME\", \"ID\"),\n", + " ADM1_NAME = all_of(ADMIN_1)\n", + " ) %>%\n", + " select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>%\n", + " distinct() %>%\n", + " tidyr::crossing(PERIOD = period_vector) %>%\n", + " mutate(PERIOD=as.numeric(PERIOD))\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 
0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ], - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3" + "tags": [] + }, + "source": [ + "#### 3.2. Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b279d27", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Assessing facility reporting activity based on the following indicators: {paste(ACTIVITY_INDICATORS, collapse=', ')}\"))\n", + "\n", + "facility_master_routine <- left_join(\n", + " facility_master,\n", + " # dhis2_routine %>% select(OU_ID, PERIOD, all_of(DHIS2_INDICATORS)), # GP 2026-02-04\n", + " dhis2_routine %>% select(OU_ID, PERIOD, any_of(DHIS2_INDICATORS)), \n", + " by = c(\"OU_ID\", \"PERIOD\")\n", + " ) %>%\n", + " mutate(\n", + " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", + " ACTIVE_THIS_PERIOD = ifelse(\n", + " rowSums(!is.na(across(all_of(ACTIVITY_INDICATORS))) & across(all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0), \n", + " COUNT = 1 # Counting every facility\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - 
"status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Computing weighted variables for reporting rate calculation.\"))\n", - "\n", - "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT \n", - "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n", - "\n", - "dim(facility_master_routine_02)\n", - "head(facility_master_routine_02, 2)" - ], - "execution_count": null, - "outputs": [], - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259" + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", + "metadata": { + "papermill": { + "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.7. Aggregate data at ADM2 level" - ], - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "facility_master_routine <- facility_master_routine %>%\n", + " mutate(\n", + " period_date = as.Date(ym(PERIOD)),\n", + " \n", + " # Flag facilities explicitly marked as closed in their name\n", + " NAME_CLOSED = str_detect(\n", + " toupper(OU_NAME),\n", + " \"CLOTUR|FERM(E|EE)?\"\n", + " ),\n", + "\n", + " # Check whether the facility is open during the period using open/close dates\n", + " OPEN_BY_DATE = \n", + " !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", + " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)\n", + " ),\n", + " \n", + " # Final definition of an open facility for the period:\n", + " # not explicitly closed, within opening/closing dates,\n", + " # and started reporting\n", + " OPEN = ifelse(\n", + " !NAME_CLOSED & OPEN_BY_DATE,\n", + " 1, 0\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "657fd6ca", + "metadata": {}, + "source": [ + "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" + ] + }, + { + "cell_type": "markdown", + "id": "a598e4b7", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", + "
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Aggregating data at admin level 2.\"))\n", - "\n", - "reporting_rate_adm2 <- facility_master_routine_02 %>% \n", - " group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", - " summarise(\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), # (numerator) sum of all facilities active per PERIOD\n", - " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), # (denominator) sum of all facilities active at least once in the YEAR\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", - " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), \n", - " .groups = \"drop\")\n", - "\n", - "dim(reporting_rate_adm2)\n", - "# head(reporting_rate_adm2, 5)" - ], - "execution_count": null, - "outputs": [], - "id": "af13191e" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Flag facilities with at least one report in the year\n", + "facility_master_routine_01 <- facility_master_routine %>%\n", + " group_by(OU_ID, YEAR) %>%\n", + " mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% # use max() to flag if ACTIVE_THIS_PERIOD is 1 at least once\n", + " ungroup()" + ] + }, + { + "cell_type": "markdown", + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", + 
"metadata": { + "papermill": { + "duration": 9.8e-05, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.8. Calculate Reporting Rates (all methods)" - ], - "id": "7d381937" + "tags": [] + }, + "source": [ + "#### 3.5. Compute Weighting factor based on \"volume of activity\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4420e559-4134-4fc3-8950-9972ebede00e", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Calculating Reporting Rates at admin level 2. Using all methods, weighted and unweighted.\"))\n", - "\n", - "reporting_rate_adm2 <- reporting_rate_adm2 %>% \n", - " mutate(\n", - " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", - " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", - " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", - " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", - " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", - " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", - " )\n", - "\n", - "dim(reporting_rate_adm2)\n", - "head(reporting_rate_adm2, 5)" - ], - "execution_count": null, - "outputs": [], - "id": "b41263f8" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Computing volume of activity using indicator: {paste(VOLUME_ACTIVITY_INDICATORS, collapse=', ')}\"))\n", + "\n", + "# Compute MEAN_REPORTED_CASES_BY_HF as total cases over months 
with activity\n", + "mean_monthly_cases <- dhis2_routine %>% \n", + " mutate(total_cases_by_hf_month = rowSums(across(all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", + " group_by(ADM2_ID, OU_ID) %>% \n", + " summarise(\n", + " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", + " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", + " .groups = \"drop\"\n", + " ) %>% \n", + " mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", + " select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", + "\n", + "mean_monthly_cases_adm2 <- mean_monthly_cases %>% \n", + " select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% \n", + " group_by(ADM2_ID) %>% \n", + " summarise(SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm=TRUE), \n", + " NR_OF_HF = n())\n", + "\n", + "# Compute weights\n", + "hf_weights <- mean_monthly_cases %>% \n", + " left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", + " mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n", + "\n", + "# Join with rest of data\n", + "facility_master_routine_02 <- facility_master_routine_01 %>%\n", + " left_join(hf_weights %>% select(OU_ID, WEIGHT), by = c(\"OU_ID\"))" + ] + }, + { + "cell_type": "markdown", + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", - "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. Select correct col for `REPORTING_RATE` based on denominator method" - ], - "id": "5e593659" + "tags": [] + }, + "source": [ + "#### 3.6. 
Compute Weighted variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000057, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 4.1. Select results and format" - ], - "id": "c75f2249" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Computing weighted variables for reporting rate calculation.\"))\n", + "\n", + "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT \n", + "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n", + "\n", + "dim(facility_master_routine_02)\n", + "head(facility_master_routine_02, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - 
"source": [ - "if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") { \n", - " rr_column_selection <- \"RR_ACTIVE_HF\" \n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_ACTIVE_HF_W\"\n", - " }\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " rr_column_selection <- \"RR_OPEN_HF\"\n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_OPEN_HF_W\"\n", - " }\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "75e71b38" + "tags": [] + }, + "source": [ + "#### 3.7. Aggregate data at ADM2 level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af13191e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Aggregating data at admin level 2.\"))\n", + "\n", + "reporting_rate_adm2 <- facility_master_routine_02 %>% \n", + " group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", + " summarise(\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), # (numerator) sum of all facilities active per PERIOD\n", + " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), # (denominator) sum of all facilities active at least once in the YEAR\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), \n", + " .groups = \"drop\")\n", + "\n", + "dim(reporting_rate_adm2)\n", + "# head(reporting_rate_adm2, 5)" + ] + }, + { + "cell_type": "markdown", + "id": "7d381937", + "metadata": {}, + "source": [ + "#### 3.8. 
Calculate Reporting Rates (all methods)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b41263f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Calculating Reporting Rates at admin level 2. Using all methods, weighted and unweighted.\"))\n", + "\n", + "reporting_rate_adm2 <- reporting_rate_adm2 %>% \n", + " mutate(\n", + " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", + " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", + " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", + " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", + " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", + " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", + " )\n", + "\n", + "dim(reporting_rate_adm2)\n", + "head(reporting_rate_adm2, 5)" + ] + }, + { + "cell_type": "markdown", + "id": "5e593659", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Using reporting rate column: `{rr_column_selection}` \n", - "based on DATAELEMENT_METHOD_DENOMINATOR == {DATAELEMENT_METHOD_DENOMINATOR} \n", - "and USE_WEIGHTED_REPORTING_RATES == {USE_WEIGHTED_REPORTING_RATES}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "3df36abb" + "tags": [] + }, + "source": [ + "## 4. 
Select correct col for `REPORTING_RATE` based on denominator method" + ] + }, + { + "cell_type": "markdown", + "id": "c75f2249", + "metadata": { + "papermill": { + "duration": 5.7e-05, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "log_msg(glue(\"Formatting table for '{DATAELEMENT_METHOD_DENOMINATOR}' selection.\"))\n", - "\n", - "# Select column and format final table\n", - "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", - " mutate(MONTH = PERIOD %% 100) %>%\n", - " rename(REPORTING_RATE = !!sym(rr_column_selection)) %>%\n", - " select(all_of(fixed_cols_rr))\n", - "\n", - "print(dim(reporting_rate_dataelement))\n", - "head(reporting_rate_dataelement, 3)" - ], - "execution_count": null, - "outputs": [], - "id": "0ccc272c" + "tags": [] + }, + "source": [ + "### 4.1. Select results and format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e71b38", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 
Inspect reporting rate values" - ], - "id": "ca66e785" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") { \n", + " rr_column_selection <- \"RR_ACTIVE_HF\" \n", + " if (USE_WEIGHTED_REPORTING_RATES) {\n", + " rr_column_selection <- \"RR_ACTIVE_HF_W\"\n", + " }\n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " rr_column_selection <- \"RR_OPEN_HF\"\n", + " if (USE_WEIGHTED_REPORTING_RATES) {\n", + " rr_column_selection <- \"RR_OPEN_HF_W\"\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df36abb", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ], - "execution_count": null, - "outputs": [], - "id": "31535459" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Using reporting rate column: `{rr_column_selection}` \n", + "based on DATAELEMENT_METHOD_DENOMINATOR == {DATAELEMENT_METHOD_DENOMINATOR} \n", + "and USE_WEIGHTED_REPORTING_RATES == {USE_WEIGHTED_REPORTING_RATES}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccc272c", + "metadata": { + "papermill": { + "duration": 0.182574, + 
"end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": false, - "start_time": "2026-01-16T10:24:07.836278", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ], - "execution_count": null, - "outputs": [], - "id": "6778f17d" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Formatting table for '{DATAELEMENT_METHOD_DENOMINATOR}' selection.\"))\n", + "\n", + "# Select column and format final table\n", + "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", + " mutate(MONTH = PERIOD %% 100) %>%\n", + " rename(REPORTING_RATE = !!sym(rr_column_selection)) %>%\n", + " select(all_of(fixed_cols_rr))\n", + "\n", + "print(dim(reporting_rate_dataelement))\n", + "head(reporting_rate_dataelement, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "ca66e785", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" - }, 
- "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ], - "execution_count": null, - "outputs": [], - "id": "a7f013fd" + "tags": [] + }, + "source": [ + "## 5. Inspect reporting rate values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31535459", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.000088, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 
📁 Export to `data/` folder" - ], - "id": "2866816a-7015-4c5c-b904-f553f3b4790d" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6778f17d", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", - "write_parquet(reporting_rate_dataelement, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", - "write.csv(reporting_rate_dataelement, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1" + "tags": [], + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + }, + "outputs": [], + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 
0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7f013fd", + "metadata": { + "papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot without outliers\n", + " geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "2866816a-7015-4c5c-b904-f553f3b4790d", + "metadata": { + "papermill": { + "duration": 8.8e-05, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": "completed" }, + "tags": [] + }, + "source": [ + "## 5. 
📁 Export to `data/` folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", + "metadata": { "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - "AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "NER_routine_outliers-mean_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", + "\n", + "# parquet\n", + "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", + "write_parquet(reporting_rate_dataelement, file_path)\n", + "log_msg(glue(\"Exported : {file_path}\"))\n", + "\n", + "# csv\n", + "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", + "write.csv(reporting_rate_dataelement, file_path, row.names = FALSE)\n", + "log_msg(glue(\"Exported : {file_path}\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + 
"language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "AVAILABILITY_INDICATORS": [ + "CONF", + "PRES", + "SUSP", + "TEST" + ], + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "NER_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index e9885f8..dcac610 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -1,1205 +1,1205 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "30bf8dfc", - "metadata": {}, - "source": [ - "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", - "\n", - "The **reporting rate** measures the proportion of registered health facilities that submit data. 
It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", - "
\n", - "\n", - "**Dataset Selection**
\n", - "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", - "\n", - "**Calculation Logic**
\n", - "From the selected dataset(s):\n", - "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", - "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", - "\n", - "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", - "
\n", - "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "
\n", - "and expressed as a **proportion** between 0 and 1.\n", - "
\n", - "\n", - "-----\n", - "\n", - "### Additional Data Processing Steps\n", - "\n", - "- **Handling Multiple Datasets:** \n", - " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", - "\n", - "- **Deduplication of Entries:** \n", - " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", - "
    \n", - "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", - "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", - "
\n", - "\n", - "-----\n", - "\n", - "\n", - "### 🇳🇪 Niger-Specific Processing: \n", - " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", - "
\n", - " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", - "\n", - "------\n", - "\n", - "### Pipeline parameters\n", - "\n", - "- **Routine data source**: Select the routine dataset variant used for reporting rate computation.\n", - "\n", - "- **`raw`**: Loads routine data from the formatted dataset.\n", - "\n", - "- **`imputed`**: Loads routine data from the outliers dataset using imputed values.\n", - "\n", - "- **`outliers_removed`**: Loads routine data from the outliers dataset after outliers removal." - ] + "cells": [ + { + "cell_type": "markdown", + "id": "30bf8dfc", + "metadata": {}, + "source": [ + "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", + "\n", + "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", + "
\n", + "\n", + "**Dataset Selection**
\n", + "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", + "\n", + "**Calculation Logic**
\n", + "From the selected dataset(s):\n", + "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", + "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", + "\n", + "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", + "
\n", + "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", + "
\n", + "and expressed as a **proportion** between 0 and 1.\n", + "
\n", + "\n", + "-----\n", + "\n", + "### Additional Data Processing Steps\n", + "\n", + "- **Handling Multiple Datasets:** \n", + " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", + "\n", + "- **Deduplication of Entries:** \n", + " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", + "
    \n", + "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", + "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", + "
\n", + "\n", + "-----\n", + "\n", + "\n", + "### 🇳🇪 Niger-Specific Processing: \n", + " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", + "
\n", + " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", + "\n", + "------\n", + "\n", + "### Pipeline parameters\n", + "\n", + "- **Routine data source**: Select the routine dataset variant used for reporting rate computation.\n", + "\n", + "- **`raw`**: Loads routine data from the formatted dataset.\n", + "\n", + "- **`imputed`**: Loads routine data from the outliers dataset using imputed values.\n", + "\n", + "- **`outliers_removed`**: Loads routine data from the outliers dataset after outliers removal." + ] + }, + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 9.2e-05, + "end_time": "2025-12-19T10:21:50.273573", + "exception": false, + "start_time": "2025-12-19T10:21:50.273481", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. 
Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:21:50.332786Z", + "iopub.status.busy": "2025-12-19T10:21:50.277536Z", + "iopub.status.idle": "2025-12-19T10:23:03.339080Z", + "shell.execute_reply": "2025-12-19T10:23:03.336413Z" + }, + "papermill": { + "duration": 73.068006, + "end_time": "2025-12-19T10:23:03.341764", + "exception": false, + "start_time": "2025-12-19T10:21:50.273758", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "\n", + "# Load libraries \n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 0.00017, + "end_time": "2025-12-19T10:23:03.342235", + "exception": false, + "start_time": "2025-12-19T10:23:03.342065", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.1. 
Load and check `config_json` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.351367Z", + "iopub.status.busy": "2025-12-19T10:23:03.348819Z", + "iopub.status.idle": "2025-12-19T10:23:03.979814Z", + "shell.execute_reply": "2025-12-19T10:23:03.976617Z" + }, + "papermill": { + "duration": 0.640406, + "end_time": "2025-12-19T10:23:03.982829", + "exception": false, + "start_time": "2025-12-19T10:23:03.342423", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.987632Z", + "iopub.status.busy": "2025-12-19T10:23:03.985301Z", + "iopub.status.idle": "2025-12-19T10:23:04.011308Z", + "shell.execute_reply": "2025-12-19T10:23:04.008941Z" + }, + "papermill": { + "duration": 0.031002, + "end_time": "2025-12-19T10:23:04.014107", + "exception": false, + "start_time": "2025-12-19T10:23:03.983105", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# Which reporting rate PRODUCT_UID 
to use (DHIS2 dataset id)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", + "\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] + }, + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00015, + "end_time": "2025-12-19T10:23:04.014523", + "exception": false, + "start_time": "2025-12-19T10:23:04.014373", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. Validate parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.019283Z", + "iopub.status.busy": "2025-12-19T10:23:04.017257Z", + "iopub.status.idle": "2025-12-19T10:23:04.039652Z", + "shell.execute_reply": "2025-12-19T10:23:04.037292Z" + }, + "papermill": { + "duration": 0.02788, + "end_time": "2025-12-19T10:23:04.042642", + "exception": false, + "start_time": "2025-12-19T10:23:04.014762", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000092, - "end_time": "2025-12-19T10:21:50.273573", - "exception": false, - "start_time": "2025-12-19T10:21:50.273481", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. 
Setup" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# default: raw routine\n", + "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }" + ] + }, + { + "cell_type": "markdown", + "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", + "metadata": { + "papermill": { + "duration": 0.000144, + "end_time": "2025-12-19T10:23:04.043066", + "exception": false, + "start_time": "2025-12-19T10:23:04.042922", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:21:50.332786Z", - "iopub.status.busy": "2025-12-19T10:21:50.277536Z", - "iopub.status.idle": "2025-12-19T10:23:03.339080Z", - "shell.execute_reply": "2025-12-19T10:23:03.336413Z" - }, - "papermill": { - "duration": 73.068006, - "end_time": "2025-12-19T10:23:03.341764", - "exception": false, - "start_time": "2025-12-19T10:21:50.273758", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] + "tags": [] + }, + "source": [ + 
"#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" + ] + }, + { + "cell_type": "markdown", + "id": "682a62d5", + "metadata": {}, + "source": [ + "### 🐍 This probably to be moved to pipeline.py code?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7469898d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.047782Z", + "iopub.status.busy": "2025-12-19T10:23:04.045631Z", + "iopub.status.idle": "2025-12-19T10:23:04.545551Z", + "shell.execute_reply": "2025-12-19T10:23:04.542372Z" }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 0.00017, - "end_time": "2025-12-19T10:23:03.342235", - "exception": false, - "start_time": "2025-12-19T10:23:03.342065", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.1. Load and check `config_json` file" - ] + "papermill": { + "duration": 0.505908, + "end_time": "2025-12-19T10:23:04.549148", + "exception": false, + "start_time": "2025-12-19T10:23:04.043240", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.351367Z", - "iopub.status.busy": "2025-12-19T10:23:03.348819Z", - "iopub.status.idle": "2025-12-19T10:23:03.979814Z", - "shell.execute_reply": "2025-12-19T10:23:03.976617Z" - }, - "papermill": { - "duration": 0.640406, - "end_time": "2025-12-19T10:23:03.982829", - "exception": false, - "start_time": "2025-12-19T10:23:03.342423", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", 
- " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", + "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", + " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", + " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:04.549558", + "exception": false, + "start_time": "2025-12-19T10:23:04.549419", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.987632Z", - "iopub.status.busy": "2025-12-19T10:23:03.985301Z", - "iopub.status.idle": "2025-12-19T10:23:04.011308Z", - "shell.execute_reply": "2025-12-19T10:23:04.008941Z" - }, - "papermill": { - "duration": 0.031002, - "end_time": "2025-12-19T10:23:04.014107", - "exception": false, - "start_time": "2025-12-19T10:23:03.983105", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which reporting rate PRODUCT_UID to use (DHIS2 dataset id)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", - "\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for 
exporting RR tables" - ] + "tags": [] + }, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 0.000152, + "end_time": "2025-12-19T10:23:04.549924", + "exception": false, + "start_time": "2025-12-19T10:23:04.549772", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00015, - "end_time": "2025-12-19T10:23:04.014523", - "exception": false, - "start_time": "2025-12-19T10:23:04.014373", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.2. Validate parameters" - ] + "tags": [] + }, + "source": [ + "### 2.1. Load routine data (DHIS2) \n", + "Already formatted routine data, we use this as the master table
\n", + "(only used at the very end before exporting the table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.554212Z", + "iopub.status.busy": "2025-12-19T10:23:04.552423Z", + "iopub.status.idle": "2025-12-19T10:23:05.773324Z", + "shell.execute_reply": "2025-12-19T10:23:05.771316Z" }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.019283Z", - "iopub.status.busy": "2025-12-19T10:23:04.017257Z", - "iopub.status.idle": "2025-12-19T10:23:04.039652Z", - "shell.execute_reply": "2025-12-19T10:23:04.037292Z" - }, - "papermill": { - "duration": 0.02788, - "end_time": "2025-12-19T10:23:04.042642", - "exception": false, - "start_time": "2025-12-19T10:23:04.014762", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# default: raw routine\n", - "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }" - ] + "papermill": { + "duration": 1.225668, + "end_time": "2025-12-19T10:23:05.775768", + "exception": false, + "start_time": "2025-12-19T10:23:04.550100", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", - "metadata": { - "papermill": { - "duration": 0.000144, - "end_time": "2025-12-19T10:23:04.043066", - "exception": false, - "start_time": "2025-12-19T10:23:04.042922", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.3. 
🔍 Check REPORTING_RATE_PRODUCT_ID is configured" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# select dataset\n", + "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", + " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "} else {\n", + " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + "}\n", + "\n", + "# Load file from dataset\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", + "\n", + "# Subset data to keep only columns defined in fixed_cols_rr (if defined)\n", + "if (exists(\"fixed_cols_rr\")) {\n", + " dhis2_routine <- dhis2_routine %>% \n", + " select(any_of(fixed_cols_rr)) |> \n", + " distinct()\n", + "}\n", + "\n", + "# log\n", + "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", + "metadata": { + "papermill": { + "duration": 0.000155, + "end_time": "2025-12-19T10:23:05.776205", + "exception": false, + "start_time": "2025-12-19T10:23:05.776050", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "682a62d5", - "metadata": {}, - "source": [ - "### 🐍 This probably to be moved to pipeline.py code?" - ] + "tags": [] + }, + "source": [ + "### 2.2. 
Load Reporting Rate data (DHIS2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:05.780487Z", + "iopub.status.busy": "2025-12-19T10:23:05.778651Z", + "iopub.status.idle": "2025-12-19T10:23:07.096742Z", + "shell.execute_reply": "2025-12-19T10:23:07.094774Z" }, - { - "cell_type": "code", - "execution_count": null, - "id": "7469898d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.047782Z", - "iopub.status.busy": "2025-12-19T10:23:04.045631Z", - "iopub.status.idle": "2025-12-19T10:23:04.545551Z", - "shell.execute_reply": "2025-12-19T10:23:04.542372Z" - }, - "papermill": { - "duration": 0.505908, - "end_time": "2025-12-19T10:23:04.549148", - "exception": false, - "start_time": "2025-12-19T10:23:04.043240", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", - "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", - " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. 
\n", - " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", - "}" - ] + "papermill": { + "duration": 1.322737, + "end_time": "2025-12-19T10:23:07.099136", + "exception": false, + "start_time": "2025-12-19T10:23:05.776399", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\") # reporting rate file\n", + "\n", + "# Load file from dataset\n", + "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", + "\n", + "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). \n", + "Dataframe dimensions: \", \n", + " paste(dim(dhis2_reporting), collapse=\", \"))\n", + "log_msg(msg)\n", + "head(dhis2_reporting, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "4d5f398b", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:07.099531", + "exception": false, + "start_time": "2025-12-19T10:23:07.099380", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Transform reporting data" + ] + }, + { + "cell_type": "markdown", + "id": "adcbee0b", + "metadata": { + "papermill": { + "duration": 0.0001, + "end_time": "2025-12-19T10:23:07.099849", + "exception": false, + "start_time": "2025-12-19T10:23:07.099749", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.1. 
Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", + "Logic:\n", + "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", + "* If none provided (**empty** field) skip filtering and **keep everything**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "795a5e74", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:07.104617Z", + "iopub.status.busy": "2025-12-19T10:23:07.102475Z", + "iopub.status.idle": "2025-12-19T10:23:08.406561Z", + "shell.execute_reply": "2025-12-19T10:23:08.404419Z" }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:04.549558", - "exception": false, - "start_time": "2025-12-19T10:23:04.549419", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] + "papermill": { + "duration": 1.309322, + "end_time": "2025-12-19T10:23:08.409343", + "exception": false, + "start_time": "2025-12-19T10:23:07.100021", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 0.000152, - "end_time": "2025-12-19T10:23:04.549924", - "exception": false, - "start_time": "2025-12-19T10:23:04.549772", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.1. Load routine data (DHIS2) \n", - "Already formatted routine data, we use this as the master table
\n", - "(only used at the very end before exporting the table)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", + "if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", + " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", + " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", + " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", + " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", + "} else {\n", + " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. \n", + " 🦘 Skipping filtering and keeping all data. 
Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4237408a", + "metadata": { + "papermill": { + "duration": 0.000133, + "end_time": "2025-12-19T10:23:08.409660", + "exception": false, + "start_time": "2025-12-19T10:23:08.409527", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.554212Z", - "iopub.status.busy": "2025-12-19T10:23:04.552423Z", - "iopub.status.idle": "2025-12-19T10:23:05.773324Z", - "shell.execute_reply": "2025-12-19T10:23:05.771316Z" - }, - "papermill": { - "duration": 1.225668, - "end_time": "2025-12-19T10:23:05.775768", - "exception": false, - "start_time": "2025-12-19T10:23:04.550100", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# Subset data to keep only columns defined in fixed_cols_rr (if defined)\n", - "if (exists(\"fixed_cols_rr\")) {\n", - " dhis2_routine <- dhis2_routine %>% \n", - " 
select(any_of(fixed_cols_rr)) |> \n", - " distinct()\n", - "}\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)" - ] + "tags": [] + }, + "source": [ + "### 3.2. Pivot wider" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c3b9a65", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.413415Z", + "iopub.status.busy": "2025-12-19T10:23:08.411805Z", + "iopub.status.idle": "2025-12-19T10:23:08.884793Z", + "shell.execute_reply": "2025-12-19T10:23:08.880916Z" }, - { - "cell_type": "markdown", - "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", - "metadata": { - "papermill": { - "duration": 0.000155, - "end_time": "2025-12-19T10:23:05.776205", - "exception": false, - "start_time": "2025-12-19T10:23:05.776050", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.2. 
Load Reporting Rate data (DHIS2)" - ] + "papermill": { + "duration": 0.479538, + "end_time": "2025-12-19T10:23:08.889341", + "exception": false, + "start_time": "2025-12-19T10:23:08.409803", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:05.780487Z", - "iopub.status.busy": "2025-12-19T10:23:05.778651Z", - "iopub.status.idle": "2025-12-19T10:23:07.096742Z", - "shell.execute_reply": "2025-12-19T10:23:07.094774Z" - }, - "papermill": { - "duration": 1.322737, - "end_time": "2025-12-19T10:23:07.099136", - "exception": false, - "start_time": "2025-12-19T10:23:05.776399", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\") # reporting rate file\n", - "\n", - "# Load file from dataset\n", - "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", - "\n", - "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
\n", - "Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - "log_msg(msg)\n", - "head(dhis2_reporting, 3)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", + "dhis2_reporting_wide <- dhis2_reporting %>%\n", + " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", + "\n", + "# Log msg\n", + "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", + "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", + "\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "0f485148", + "metadata": { + "papermill": { + "duration": 0.000186, + "end_time": "2025-12-19T10:23:08.889829", + "exception": false, + "start_time": "2025-12-19T10:23:08.889643", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "4d5f398b", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:07.099531", - "exception": false, - "start_time": "2025-12-19T10:23:07.099380", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Transform reporting data" - ] + "tags": [] + }, + "source": [ + "### 👯 Handle **duplicated** values (`OU_ID`)\n", + "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." 
+ ] + }, + { + "cell_type": "markdown", + "id": "55dececa", + "metadata": { + "papermill": { + "duration": 0.000122, + "end_time": "2025-12-19T10:23:08.890157", + "exception": false, + "start_time": "2025-12-19T10:23:08.890035", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "adcbee0b", - "metadata": { - "papermill": { - "duration": 0.0001, - "end_time": "2025-12-19T10:23:07.099849", - "exception": false, - "start_time": "2025-12-19T10:23:07.099749", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.1. Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", - "Logic:\n", - "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", - "* If none provided (**empty** field) skip filtering and **keep everything**" - ] + "tags": [] + }, + "source": [ + "#### Check for duplicated values (`OU_ID`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d761bd15", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.899486Z", + "iopub.status.busy": "2025-12-19T10:23:08.894706Z", + "iopub.status.idle": "2025-12-19T10:23:09.476248Z", + "shell.execute_reply": "2025-12-19T10:23:09.470283Z" }, - { - "cell_type": "code", - "execution_count": null, - "id": "795a5e74", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:07.104617Z", - "iopub.status.busy": "2025-12-19T10:23:07.102475Z", - "iopub.status.idle": "2025-12-19T10:23:08.406561Z", - "shell.execute_reply": "2025-12-19T10:23:08.404419Z" - }, - "papermill": { - "duration": 1.309322, - "end_time": "2025-12-19T10:23:08.409343", - "exception": false, - "start_time": "2025-12-19T10:23:07.100021", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", - "if (all(REPORTING_RATE_PRODUCT_ID %in% 
unique(dhis2_reporting$PRODUCT_UID))) {\n", - " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", - " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", - " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", - " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. \n", - " 🦘 Skipping filtering and keeping all data. Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", - "}" - ] + "papermill": { + "duration": 0.590832, + "end_time": "2025-12-19T10:23:09.481144", + "exception": false, + "start_time": "2025-12-19T10:23:08.890312", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "4237408a", - "metadata": { - "papermill": { - "duration": 0.000133, - "end_time": "2025-12-19T10:23:08.409660", - "exception": false, - "start_time": "2025-12-19T10:23:08.409527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.2. 
Pivot wider" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Check if any OU_ID is present in more than one PRODUCT_UID\n", + "# and if so list them\n", + "ou_product_counts <- dhis2_reporting %>%\n", + " group_by(OU_ID, OU_NAME) %>%\n", + " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", + " filter(PRODUCT_UID_count > 1) %>%\n", + " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", + " distinct() \n", + "\n", + "ou_product_counts\n", + "\n", + "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", + "if (nrow(ou_product_counts) > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", + "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", + " level = \"warning\")\n", + "} else {\n", + " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting data.\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "805ed555", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:09.481549", + "exception": false, + "start_time": "2025-12-19T10:23:09.481410", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c3b9a65", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.413415Z", - "iopub.status.busy": "2025-12-19T10:23:08.411805Z", - "iopub.status.idle": "2025-12-19T10:23:08.884793Z", - "shell.execute_reply": "2025-12-19T10:23:08.880916Z" - }, - "papermill": { - "duration": 0.479538, - "end_time": "2025-12-19T10:23:08.889341", - "exception": false, - "start_time": "2025-12-19T10:23:08.409803", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE 
is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", - "dhis2_reporting_wide <- dhis2_reporting %>%\n", - " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", - "\n", - "# Log msg\n", - "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", - "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", + "Logic: \n", + "1. Identify if any `OU_ID` is present in both datasets\n", + "2. For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", + " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", + " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "593b013a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:09.488856Z", + "iopub.status.busy": "2025-12-19T10:23:09.484674Z", + "iopub.status.idle": "2025-12-19T10:23:13.563200Z", + "shell.execute_reply": "2025-12-19T10:23:13.559294Z" }, - { - "cell_type": "markdown", - "id": "0f485148", - "metadata": { - "papermill": { - "duration": 0.000186, - "end_time": "2025-12-19T10:23:08.889829", - "exception": false, - "start_time": "2025-12-19T10:23:08.889643", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 👯 Handle **duplicated** values (`OU_ID`)\n", - "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." 
- ] + "papermill": { + "duration": 4.086946, + "end_time": "2025-12-19T10:23:13.568699", + "exception": false, + "start_time": "2025-12-19T10:23:09.481753", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "55dececa", - "metadata": { - "papermill": { - "duration": 0.000122, - "end_time": "2025-12-19T10:23:08.890157", - "exception": false, - "start_time": "2025-12-19T10:23:08.890035", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Check for duplicated values (`OU_ID`)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", + "dupl_ou_period <- dhis2_reporting_wide %>%\n", + " group_by(OU_ID, PERIOD) %>%\n", + " filter(n() > 1) %>%\n", + " ungroup() %>%\n", + " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", + "\n", + "# Log msg\n", + "if (nrow(dupl_ou_period) > 0) {\n", + " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", + "}\n", + "\n", + "dim(dupl_ou_period)\n", + "head(dupl_ou_period, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c72bd93a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:13.581200Z", + "iopub.status.busy": "2025-12-19T10:23:13.574942Z", + "iopub.status.idle": "2025-12-19T10:23:18.911910Z", + "shell.execute_reply": "2025-12-19T10:23:18.907746Z" }, - { - "cell_type": "code", - "execution_count": null, - "id": "d761bd15", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.899486Z", - "iopub.status.busy": "2025-12-19T10:23:08.894706Z", - "iopub.status.idle": "2025-12-19T10:23:09.476248Z", - "shell.execute_reply": "2025-12-19T10:23:09.470283Z" - }, - "papermill": { - "duration": 0.590832, - "end_time": "2025-12-19T10:23:09.481144", - 
"exception": false, - "start_time": "2025-12-19T10:23:08.890312", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if any OU_ID is present in more than one PRODUCT_UID\n", - "# and if so list them\n", - "ou_product_counts <- dhis2_reporting %>%\n", - " group_by(OU_ID, OU_NAME) %>%\n", - " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", - " filter(PRODUCT_UID_count > 1) %>%\n", - " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", - " distinct() \n", - "\n", - "ou_product_counts\n", - "\n", - "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", - "if (nrow(ou_product_counts) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", - "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", - " level = \"warning\")\n", - "} else {\n", - " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting data.\")\n", - "}" - ] + "papermill": { + "duration": 5.346749, + "end_time": "2025-12-19T10:23:18.915815", + "exception": false, + "start_time": "2025-12-19T10:23:13.569066", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "805ed555", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:09.481549", - "exception": false, - "start_time": "2025-12-19T10:23:09.481410", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", - "Logic: \n", - "1. Identify if any `OU_ID` is present in both datasets\n", - "2. 
For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", - " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", - " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 2: remove duplicated OU_ID by PERIOD\n", + "# Use the following logic:\n", + "# - 1. first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", + "# - 2. then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", + "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", + "\n", + "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " group_by(PERIOD, OU_ID) %>%\n", + " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", + " ungroup() %>%\n", + " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", + " select(-ACTUAL_REPORTS_deduplicated)\n", + "\n", + " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", + " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", + "} else {\n", + " log_msg(\"🚨 Warning: Cannot deduplicate OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. 
\n", + " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", + "} \n", + "\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "2f26c614", + "metadata": { + "papermill": { + "duration": 0.000236, + "end_time": "2025-12-19T10:23:18.916421", + "exception": false, + "start_time": "2025-12-19T10:23:18.916185", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "593b013a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:09.488856Z", - "iopub.status.busy": "2025-12-19T10:23:09.484674Z", - "iopub.status.idle": "2025-12-19T10:23:13.563200Z", - "shell.execute_reply": "2025-12-19T10:23:13.559294Z" - }, - "papermill": { - "duration": 4.086946, - "end_time": "2025-12-19T10:23:13.568699", - "exception": false, - "start_time": "2025-12-19T10:23:09.481753", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", - "dupl_ou_period <- dhis2_reporting_wide %>%\n", - " group_by(OU_ID, PERIOD) %>%\n", - " filter(n() > 1) %>%\n", - " ungroup() %>%\n", - " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", - "\n", - "# Log msg\n", - "if (nrow(dupl_ou_period) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", - "}\n", - "\n", - "dim(dupl_ou_period)\n", - "head(dupl_ou_period, 5)" - ] + "tags": [] + }, + "source": [ + "### 3.3. (🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", + "Specific for Niger SNIS instance!
\n", + "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", + "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4118991c", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:18.924306Z", + "iopub.status.busy": "2025-12-19T10:23:18.920810Z", + "iopub.status.idle": "2025-12-19T10:23:19.482033Z", + "shell.execute_reply": "2025-12-19T10:23:19.479013Z" }, - { - "cell_type": "code", - "execution_count": null, - "id": "c72bd93a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:13.581200Z", - "iopub.status.busy": "2025-12-19T10:23:13.574942Z", - "iopub.status.idle": "2025-12-19T10:23:18.911910Z", - "shell.execute_reply": "2025-12-19T10:23:18.907746Z" - }, - "papermill": { - "duration": 5.346749, - "end_time": "2025-12-19T10:23:18.915815", - "exception": false, - "start_time": "2025-12-19T10:23:13.569066", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 2: remove duplicated OU_ID by PERIOD\n", - "# Use the following logic:\n", - "# - 1. first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", - "# - 2. 
then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", - "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", - "\n", - "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " group_by(PERIOD, OU_ID) %>%\n", - " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", - " ungroup() %>%\n", - " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", - " select(-ACTUAL_REPORTS_deduplicated)\n", - "\n", - " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", - " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(\"🚨 Warning: Cannot deduplicate OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. \n", - " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", - "} \n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" - ] + "papermill": { + "duration": 0.56938, + "end_time": "2025-12-19T10:23:19.486133", + "exception": false, + "start_time": "2025-12-19T10:23:18.916753", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2f26c614", - "metadata": { - "papermill": { - "duration": 0.000236, - "end_time": "2025-12-19T10:23:18.916421", - "exception": false, - "start_time": "2025-12-19T10:23:18.916185", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.3. 
(🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", - "Specific for Niger SNIS instance!
\n", - "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", - "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", + "\n", + " # Check if any values >1 exist\n", + " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", + " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", + "\n", + " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", + " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", + " dupl_actual_reports <- dhis2_reporting_wide %>%\n", + " filter(ACTUAL_REPORTS > 1) %>%\n", + " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", + " distinct()\n", + "\n", + " log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", + "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", + "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", + "\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " mutate(\n", + " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", + " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", + " )\n", + "\n", + " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", + "\n", + "} # else nothing to replace\n", + "\n", + " dim(dhis2_reporting_wide)\n", + " head(dhis2_reporting_wide, 3)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "066319a3", + "metadata": { + 
"papermill": { + "duration": 0.000172, + "end_time": "2025-12-19T10:23:19.486674", + "exception": false, + "start_time": "2025-12-19T10:23:19.486502", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "4118991c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:18.924306Z", - "iopub.status.busy": "2025-12-19T10:23:18.920810Z", - "iopub.status.idle": "2025-12-19T10:23:19.482033Z", - "shell.execute_reply": "2025-12-19T10:23:19.479013Z" - }, - "papermill": { - "duration": 0.56938, - "end_time": "2025-12-19T10:23:19.486133", - "exception": false, - "start_time": "2025-12-19T10:23:18.916753", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", - "\n", - " # Check if any values >1 exist\n", - " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", - " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", - "\n", - " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", - " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", - " dupl_actual_reports <- dhis2_reporting_wide %>%\n", - " filter(ACTUAL_REPORTS > 1) %>%\n", - " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", - " distinct()\n", - "\n", - " log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", - "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", - "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", - "\n", 
- " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "\n", - " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", - "\n", - "} # else nothing to replace\n", - "\n", - " dim(dhis2_reporting_wide)\n", - " head(dhis2_reporting_wide, 3)\n", - "}" - ] + "tags": [] + }, + "source": [ + "### 3.4. Aggregate at ADM2 level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e94eeddd", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.494212Z", + "iopub.status.busy": "2025-12-19T10:23:19.491141Z", + "iopub.status.idle": "2025-12-19T10:23:19.791631Z", + "shell.execute_reply": "2025-12-19T10:23:19.786378Z" }, - { - "cell_type": "markdown", - "id": "066319a3", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2025-12-19T10:23:19.486674", - "exception": false, - "start_time": "2025-12-19T10:23:19.486502", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.4. 
Aggregate at AMD2 level" - ] + "papermill": { + "duration": 0.308903, + "end_time": "2025-12-19T10:23:19.795888", + "exception": false, + "start_time": "2025-12-19T10:23:19.486985", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "e94eeddd", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.494212Z", - "iopub.status.busy": "2025-12-19T10:23:19.491141Z", - "iopub.status.idle": "2025-12-19T10:23:19.791631Z", - "shell.execute_reply": "2025-12-19T10:23:19.786378Z" - }, - "papermill": { - "duration": 0.308903, - "end_time": "2025-12-19T10:23:19.795888", - "exception": false, - "start_time": "2025-12-19T10:23:19.486985", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Sum up values (now at acility level) to get totals per ADM2_ID and PERIOD\n", - "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", - " group_by(\n", - " PERIOD, \n", - " YEAR, MONTH, # keep these just for sanity check (not needed for grouping)\n", - " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", - " ADM2_NAME, ADM2_ID\n", - " ) %>%\n", - " summarise(\n", - " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", - " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", - " .groups = 'drop'\n", - " ) \n", - "\n", - "# Add log messages\n", - "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. 
\n", - "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", - "head(dhis2_reporting_wide_adm2, 3)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Sum up values (now at facility level) to get totals per ADM2_ID and PERIOD\n", + "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", + " group_by(\n", + " PERIOD, \n", + " YEAR, MONTH, # keep these just for sanity check (not needed for grouping)\n", + " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", + " ADM2_NAME, ADM2_ID\n", + " ) %>%\n", + " summarise(\n", + " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", + " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", + " .groups = 'drop'\n", + " ) \n", + "\n", + "# Add log messages\n", + "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. \n", + "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", + "head(dhis2_reporting_wide_adm2, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "eb181891", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:19.796350", + "exception": false, + "start_time": "2025-12-19T10:23:19.796199", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "eb181891", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:19.796350", - "exception": false, - "start_time": "2025-12-19T10:23:19.796199", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.5. Calculate REPORTING_RATE\n", - "**numerator**: `ACTUAL_REPORTS`
\n", - "**denominator**: `EXPECTED_REPORTS`" - ] + "tags": [] + }, + "source": [ + "### 3.5. Calculate REPORTING_RATE\n", + "**numerator**: `ACTUAL_REPORTS`
\n", + "**denominator**: `EXPECTED_REPORTS`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a1c20", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.803233Z", + "iopub.status.busy": "2025-12-19T10:23:19.799996Z", + "iopub.status.idle": "2025-12-19T10:23:19.994060Z", + "shell.execute_reply": "2025-12-19T10:23:19.991575Z" }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a1c20", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.803233Z", - "iopub.status.busy": "2025-12-19T10:23:19.799996Z", - "iopub.status.idle": "2025-12-19T10:23:19.994060Z", - "shell.execute_reply": "2025-12-19T10:23:19.991575Z" - }, - "papermill": { - "duration": 0.200465, - "end_time": "2025-12-19T10:23:19.997024", - "exception": false, - "start_time": "2025-12-19T10:23:19.796559", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - "\n", - "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", - "head(reporting_rate_results, 3) " - ] + "papermill": { + "duration": 0.200465, + "end_time": "2025-12-19T10:23:19.997024", + "exception": false, + "start_time": "2025-12-19T10:23:19.796559", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", - "metadata": { - "papermill": { - "duration": 0.000123, - "end_time": "2025-12-19T10:23:19.997465", - "exception": false, - "start_time": "2025-12-19T10:23:19.997342", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.6. 
Ensure consistency of table (probably can skip because all data comes from the same source!)\n", - "Left join reporting indicators with DHIS2 routine data.\n", - "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", + "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", + " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", + "\n", + "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", + "head(reporting_rate_results, 3) " + ] + }, + { + "cell_type": "markdown", + "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", + "metadata": { + "papermill": { + "duration": 0.000123, + "end_time": "2025-12-19T10:23:19.997465", + "exception": false, + "start_time": "2025-12-19T10:23:19.997342", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.001909Z", - "iopub.status.busy": "2025-12-19T10:23:19.999878Z", - "iopub.status.idle": "2025-12-19T10:23:20.072344Z", - "shell.execute_reply": "2025-12-19T10:23:20.070004Z" - }, - "papermill": { - "duration": 0.077426, - "end_time": "2025-12-19T10:23:20.075077", - "exception": false, - "start_time": "2025-12-19T10:23:19.997651", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "reporting_rate_dataset <- left_join(dhis2_routine, \n", - " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", - " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", - "\n", - "print(dim(reporting_rate_dataset))\n", - "head(reporting_rate_dataset, 3)" - ] + "tags": [] + }, + "source": [ + "### 
3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", + "Left join reporting indicators with DHIS2 routine data.\n", + "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.001909Z", + "iopub.status.busy": "2025-12-19T10:23:19.999878Z", + "iopub.status.idle": "2025-12-19T10:23:20.072344Z", + "shell.execute_reply": "2025-12-19T10:23:20.070004Z" }, - { - "cell_type": "markdown", - "id": "6b19e88d", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2025-12-19T10:23:20.075561", - "exception": false, - "start_time": "2025-12-19T10:23:20.075388", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.7. Final visual check on REPORTING_RATE values" - ] + "papermill": { + "duration": 0.077426, + "end_time": "2025-12-19T10:23:20.075077", + "exception": false, + "start_time": "2025-12-19T10:23:19.997651", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbfec60f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", - "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "if (min_rr < 0 | max_rr > 1) { \n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", - "} else {\n", - " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. 
\n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", - "}" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "reporting_rate_dataset <- left_join(dhis2_routine, \n", + " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", + " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", + "\n", + "print(dim(reporting_rate_dataset))\n", + "head(reporting_rate_dataset, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19e88d", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2025-12-19T10:23:20.075561", + "exception": false, + "start_time": "2025-12-19T10:23:20.075388", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "8878192f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.080475Z", - "iopub.status.busy": "2025-12-19T10:23:20.078272Z", - "iopub.status.idle": "2025-12-19T10:23:21.456898Z", - "shell.execute_reply": "2025-12-19T10:23:21.453352Z" - }, - "papermill": { - "duration": 1.384875, - "end_time": "2025-12-19T10:23:21.460674", - "exception": false, - "start_time": "2025-12-19T10:23:20.075799", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Simple plot to visualize distribution of REPORTING_RATE\n", - "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", - " geom_histogram() +\n", - " labs(\n", - " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", - " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", - " ) +\n", - " theme_minimal()" - ] + "tags": [] + }, + "source": [ + "### 3.7. 
Final visual check on REPORTING_RATE values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbfec60f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", + "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "if (min_rr < 0 | max_rr > 1) { \n", + " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", + "} else {\n", + " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8878192f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.080475Z", + "iopub.status.busy": "2025-12-19T10:23:20.078272Z", + "iopub.status.idle": "2025-12-19T10:23:21.456898Z", + "shell.execute_reply": "2025-12-19T10:23:21.453352Z" }, - { - "cell_type": "markdown", - "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", - "metadata": { - "papermill": { - "duration": 0.000104, - "end_time": "2025-12-19T10:23:21.460981", - "exception": false, - "start_time": "2025-12-19T10:23:21.460877", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. 📁 Export to `data/` folder\n", - "Export as both .csv and .parquet file formats." 
- ] + "papermill": { + "duration": 1.384875, + "end_time": "2025-12-19T10:23:21.460674", + "exception": false, + "start_time": "2025-12-19T10:23:20.075799", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "9adc033d-18d6-4786-8f96-21337b3e005f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:21.467337Z", - "iopub.status.busy": "2025-12-19T10:23:21.464010Z", - "iopub.status.idle": "2025-12-19T10:23:22.383295Z", - "shell.execute_reply": "2025-12-19T10:23:22.379935Z" - }, - "papermill": { - "duration": 0.926094, - "end_time": "2025-12-19T10:23:22.387190", - "exception": false, - "start_time": "2025-12-19T10:23:21.461096", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")) \n", - "write_parquet(reporting_rate_dataset, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", - "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ] + "tags": [], + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + }, + "outputs": [], + "source": [ + "# Simple plot to visualize distribution of REPORTING_RATE\n", + "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", + " geom_histogram() +\n", + " labs(\n", + " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", + " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + 
"cell_type": "markdown", + "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", + "metadata": { + "papermill": { + "duration": 0.000104, + "end_time": "2025-12-19T10:23:21.460981", + "exception": false, + "start_time": "2025-12-19T10:23:21.460877", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + "tags": [] + }, + "source": [ + "## 4. 📁 Export to `data/` folder\n", + "Export as both .csv and .parquet file formats." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9adc033d-18d6-4786-8f96-21337b3e005f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:21.467337Z", + "iopub.status.busy": "2025-12-19T10:23:21.464010Z", + "iopub.status.idle": "2025-12-19T10:23:22.383295Z", + "shell.execute_reply": "2025-12-19T10:23:22.379935Z" }, "papermill": { - "default_parameters": {}, - "duration": 94.192072, - "end_time": "2025-12-19T10:23:22.614345", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", - "parameters": { - "ROUTINE_FILE": "NER_routine_outliers-mean_imputed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace" - }, - "start_time": "2025-12-19T10:21:48.422273", - "version": "2.6.0" + "duration": 0.926094, + "end_time": "2025-12-19T10:23:22.387190", + "exception": false, + "start_time": "2025-12-19T10:23:21.461096", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", + "\n", + "# parquet\n", + "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, 
\"_reporting_rate_dataset.parquet\")) \n", + "write_parquet(reporting_rate_dataset, file_path)\n", + "log_msg(glue(\"Exported : {file_path}\"))\n", + "\n", + "# csv\n", + "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", + "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", + "log_msg(glue(\"Exported : {file_path}\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "nbformat": 4, - "nbformat_minor": 5 + "papermill": { + "default_parameters": {}, + "duration": 94.192072, + "end_time": "2025-12-19T10:23:22.614345", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", + "parameters": { + "ROUTINE_FILE": "NER_routine_outliers_imputed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace" + }, + "start_time": "2025-12-19T10:21:48.422273", + "version": "2.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb index 884b286..38091fe 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb @@ -1,1294 +1,1300 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "b79cba06", - "metadata": { - "papermill": { - "duration": 
0.000249, - "end_time": "2025-12-19T10:23:27.548651", - "exception": false, - "start_time": "2025-12-19T10:23:27.548402", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ca65bcc", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:27.561213Z", - "iopub.status.busy": "2025-12-19T10:23:27.553197Z", - "iopub.status.idle": "2025-12-19T10:23:34.811467Z", - "shell.execute_reply": "2025-12-19T10:23:34.808478Z" - }, - "papermill": { - "duration": 7.265364, - "end_time": "2025-12-19T10:23:34.814448", - "exception": false, - "start_time": "2025-12-19T10:23:27.549084", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "c5301aa3", - "metadata": { - "papermill": { - 
"duration": 0.000116, - "end_time": "2025-12-19T10:23:34.814852", - "exception": false, - "start_time": "2025-12-19T10:23:34.814736", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.1. Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76d8a072", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:34.858197Z", - "iopub.status.busy": "2025-12-19T10:23:34.817039Z", - "iopub.status.idle": "2025-12-19T10:23:35.335737Z", - "shell.execute_reply": "2025-12-19T10:23:35.333547Z" - }, - "papermill": { - "duration": 0.52329, - "end_time": "2025-12-19T10:23:35.338288", - "exception": false, - "start_time": "2025-12-19T10:23:34.814998", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c712ac02", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.342494Z", - "iopub.status.busy": "2025-12-19T10:23:35.340803Z", - "iopub.status.idle": "2025-12-19T10:23:35.366376Z", - "shell.execute_reply": "2025-12-19T10:23:35.364165Z" - }, - "papermill": { - "duration": 0.030446, - "end_time": "2025-12-19T10:23:35.368977", - "exception": false, - "start_time": "2025-12-19T10:23:35.338531", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- 
toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "REPORTING_RATE_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e02c652e", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.373316Z", - "iopub.status.busy": "2025-12-19T10:23:35.371377Z", - "iopub.status.idle": "2025-12-19T10:23:35.396646Z", - "shell.execute_reply": "2025-12-19T10:23:35.394442Z" - }, - "papermill": { - "duration": 0.029675, - "end_time": "2025-12-19T10:23:35.398945", - "exception": false, - "start_time": "2025-12-19T10:23:35.369270", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "b79cba06", + "metadata": { + "papermill": { + "duration": 0.000249, + "end_time": "2025-12-19T10:23:27.548651", + "exception": false, + "start_time": "2025-12-19T10:23:27.548402", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1. 
Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ca65bcc", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:27.561213Z", + "iopub.status.busy": "2025-12-19T10:23:27.553197Z", + "iopub.status.idle": "2025-12-19T10:23:34.811467Z", + "shell.execute_reply": "2025-12-19T10:23:34.808478Z" + }, + "papermill": { + "duration": 7.265364, + "end_time": "2025-12-19T10:23:34.814448", + "exception": false, + "start_time": "2025-12-19T10:23:27.549084", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Load palettes\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "\n", + "# Load libraries \n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "c5301aa3", + "metadata": { + "papermill": { + "duration": 0.000116, + "end_time": "2025-12-19T10:23:34.814852", + "exception": false, + "start_time": "2025-12-19T10:23:34.814736", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 
1.1. Load and check `snt config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76d8a072", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:34.858197Z", + "iopub.status.busy": "2025-12-19T10:23:34.817039Z", + "iopub.status.idle": "2025-12-19T10:23:35.335737Z", + "shell.execute_reply": "2025-12-19T10:23:35.333547Z" + }, + "papermill": { + "duration": 0.52329, + "end_time": "2025-12-19T10:23:35.338288", + "exception": false, + "start_time": "2025-12-19T10:23:34.814998", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c712ac02", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.342494Z", + "iopub.status.busy": "2025-12-19T10:23:35.340803Z", + "iopub.status.idle": "2025-12-19T10:23:35.366376Z", + "shell.execute_reply": "2025-12-19T10:23:35.364165Z" + }, + "papermill": { + "duration": 0.030446, + "end_time": "2025-12-19T10:23:35.368977", + "exception": false, + "start_time": "2025-12-19T10:23:35.338531", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "REPORTING_RATE_DATASET_NAME <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e02c652e", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.373316Z", + "iopub.status.busy": "2025-12-19T10:23:35.371377Z", + "iopub.status.idle": "2025-12-19T10:23:35.396646Z", + "shell.execute_reply": "2025-12-19T10:23:35.394442Z" + }, + "papermill": { + "duration": 0.029675, + "end_time": "2025-12-19T10:23:35.398945", + "exception": false, + "start_time": "2025-12-19T10:23:35.369270", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Make string of product uids for plot subtitles\n", + "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", + "rr_product_uid" + ] + }, + { + "cell_type": "markdown", + "id": "30b058f4", + "metadata": { + "papermill": { + "duration": 0.000094, + "end_time": "2025-12-19T10:23:35.399231", + "exception": false, + "start_time": "2025-12-19T10:23:35.399137", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. 
Load and check `snt metadata` file\n", + "This is needed for the correct use of palettes and categories (breaks, or scale)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98a8ee49", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.403224Z", + "iopub.status.busy": "2025-12-19T10:23:35.401458Z", + "iopub.status.idle": "2025-12-19T10:23:36.335964Z", + "shell.execute_reply": "2025-12-19T10:23:36.330643Z" + }, + "papermill": { + "duration": 0.940593, + "end_time": "2025-12-19T10:23:36.339927", + "exception": false, + "start_time": "2025-12-19T10:23:35.399334", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT metadata\n", + "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00681217", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:36.357945Z", + "iopub.status.busy": "2025-12-19T10:23:36.343228Z", + "iopub.status.idle": "2025-12-19T10:23:36.535579Z", + "shell.execute_reply": "2025-12-19T10:23:36.533231Z" + }, + "papermill": { + "duration": 0.198107, + "end_time": "2025-12-19T10:23:36.538224", + "exception": false, + "start_time": "2025-12-19T10:23:36.340117", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", + "if (is.character(scale_raw) && length(scale_raw) == 1) {\n", + " break_vals <- jsonlite::fromJSON(scale_raw)\n", + "} else {\n", + " break_vals <- unlist(scale_raw, use.names = FALSE)\n", + 
"}\n", + "break_vals <- as.numeric(break_vals)\n", + "\n", + "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" + ] + }, + { + "cell_type": "markdown", + "id": "f3470564", + "metadata": { + "papermill": { + "duration": 0.000162, + "end_time": "2025-12-19T10:23:36.538638", + "exception": false, + "start_time": "2025-12-19T10:23:36.538476", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "82397307", + "metadata": { + "papermill": { + "duration": 0.000126, + "end_time": "2025-12-19T10:23:36.538947", + "exception": false, + "start_time": "2025-12-19T10:23:36.538821", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 2.1. Output of pipeline notebook\n", + "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70acb2c5", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:36.543564Z", + "iopub.status.busy": "2025-12-19T10:23:36.541311Z", + "iopub.status.idle": "2025-12-19T10:23:37.788619Z", + "shell.execute_reply": "2025-12-19T10:23:37.785121Z" + }, + "papermill": { + "duration": 1.253125, + "end_time": "2025-12-19T10:23:37.792249", + "exception": false, + "start_time": "2025-12-19T10:23:36.539124", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "\n", + "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading Reporting Rate (Dataset) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " 
cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "# log\n", + "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", + "dim(reporting_rate_dataset)\n", + "head(reporting_rate_dataset, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "48833515", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2025-12-19T10:23:37.792528", + "exception": false, + "start_time": "2025-12-19T10:23:37.792437", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 2.2. Shapes\n", + "To make choropleth (map)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3febd4f4", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:37.798194Z", + "iopub.status.busy": "2025-12-19T10:23:37.795402Z", + "iopub.status.idle": "2025-12-19T10:23:41.325848Z", + "shell.execute_reply": "2025-12-19T10:23:41.323895Z" + }, + "papermill": { + "duration": 3.535554, + "end_time": "2025-12-19T10:23:41.328226", + "exception": false, + "start_time": "2025-12-19T10:23:37.792672", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes <- NULL\n", + " })\n", + "\n", + "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. 
\\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", + "names(shapes)" + ] + }, + { + "cell_type": "markdown", + "id": "17067d56", + "metadata": { + "papermill": { + "duration": 0.000166, + "end_time": "2025-12-19T10:23:41.328651", + "exception": false, + "start_time": "2025-12-19T10:23:41.328485", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3. Plots" + ] + }, + { + "cell_type": "markdown", + "id": "9a6369ee", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2025-12-19T10:23:41.328959", + "exception": false, + "start_time": "2025-12-19T10:23:41.328850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.0. Add shapes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6641720", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.333105Z", + "iopub.status.busy": "2025-12-19T10:23:41.331427Z", + "iopub.status.idle": "2025-12-19T10:23:41.365417Z", + "shell.execute_reply": "2025-12-19T10:23:41.363294Z" + }, + "papermill": { + "duration": 0.03905, + "end_time": "2025-12-19T10:23:41.368213", + "exception": false, + "start_time": "2025-12-19T10:23:41.329163", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Join shapes to reporting rate data\n", + "\n", + "data_to_plot <- reporting_rate_dataset %>%\n", + " left_join(shapes, by = c(\"ADM2_ID\"))" + ] + }, + { + "cell_type": "markdown", + "id": "0b0d32f1", + "metadata": { + "papermill": { + "duration": 0.000195, + "end_time": "2025-12-19T10:23:41.368739", + "exception": false, + "start_time": "2025-12-19T10:23:41.368544", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. 
🎨 Dynamic categories and color assignment" + ] + }, + { + "cell_type": "markdown", + "id": "cc765e0c", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2025-12-19T10:23:41.369057", + "exception": false, + "start_time": "2025-12-19T10:23:41.368948", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 1. Define breaks and labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e79132c", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.373558Z", + "iopub.status.busy": "2025-12-19T10:23:41.371555Z", + "iopub.status.idle": "2025-12-19T10:23:41.392950Z", + "shell.execute_reply": "2025-12-19T10:23:41.390333Z" + }, + "papermill": { + "duration": 0.026996, + "end_time": "2025-12-19T10:23:41.396238", + "exception": false, + "start_time": "2025-12-19T10:23:41.369242", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Safety code to avoid breaking if nothing is found in metadata_json\n", + "if (is.null(break_vals) || length(break_vals) == 0) {\n", + " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", + " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f04cb888", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.401034Z", + "iopub.status.busy": "2025-12-19T10:23:41.398849Z", + "iopub.status.idle": "2025-12-19T10:23:41.430720Z", + "shell.execute_reply": "2025-12-19T10:23:41.428238Z" + }, + "papermill": { + "duration": 0.037712, + "end_time": "2025-12-19T10:23:41.434131", + "exception": false, + "start_time": "2025-12-19T10:23:41.396419", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 1. 
Define breaks\n", + "# Note: assumes that the data starts at 0!\n", + "# break_vals <- metadata_json$REPORTING_RATE$SCALE # moved upstream\n", + "\n", + "# 2. Create the full set of cut points (0 to Infinity)\n", + "full_breaks <- c(0, break_vals, Inf)\n", + "\n", + "# 3. Create dynamic labels\n", + "labels <- c(\n", + " paste0(\"< \", break_vals[1]), # First label\n", + " paste0(break_vals[-length(break_vals)], \" - \", break_vals[-1]), # Middle labels\n", + " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", + ")\n", + "\n", + "# Check\n", + "labels" + ] + }, + { + "cell_type": "markdown", + "id": "cb237801", + "metadata": { + "papermill": { + "duration": 0.000102, + "end_time": "2025-12-19T10:23:41.434442", + "exception": false, + "start_time": "2025-12-19T10:23:41.434340", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 2. Create `_CATEGORY` col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8303488", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.439376Z", + "iopub.status.busy": "2025-12-19T10:23:41.437165Z", + "iopub.status.idle": "2025-12-19T10:23:41.471891Z", + "shell.execute_reply": "2025-12-19T10:23:41.469251Z" + }, + "papermill": { + "duration": 0.040632, + "end_time": "2025-12-19T10:23:41.475176", + "exception": false, + "start_time": "2025-12-19T10:23:41.434544", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", + "data_to_plot <- data_to_plot %>%\n", + " mutate(\n", + " REPORTING_RATE_CATEGORY = cut(\n", + " REPORTING_RATE,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "a10237f8", + "metadata": { + "papermill": { + "duration": 0.000102, + "end_time": 
"2025-12-19T10:23:41.475483", + "exception": false, + "start_time": "2025-12-19T10:23:41.475381", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3. Pick appropriate palette" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ee6e077", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.480216Z", + "iopub.status.busy": "2025-12-19T10:23:41.478061Z", + "iopub.status.idle": "2025-12-19T10:23:41.513805Z", + "shell.execute_reply": "2025-12-19T10:23:41.511268Z" + }, + "papermill": { + "duration": 0.04138, + "end_time": "2025-12-19T10:23:41.516984", + "exception": false, + "start_time": "2025-12-19T10:23:41.475604", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Count nr of breaks\n", + "nr_of_colors <- length(labels)\n", + "\n", + "# nr_of_colors\n", + "palette_to_use <- get_range_from_count(nr_of_colors)\n", + "\n", + "# Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", + "# Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correspond to lowest value)\n", + "names(palette_to_use) <- rev(labels)\n", + "\n", + "palette_to_use\n" + ] + }, + { + "cell_type": "markdown", + "id": "d08c0c14", + "metadata": { + "papermill": { + "duration": 0.000099, + "end_time": "2025-12-19T10:23:41.517267", + "exception": false, + "start_time": "2025-12-19T10:23:41.517168", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. 
Plots" + ] + }, + { + "cell_type": "markdown", + "id": "b7781198", + "metadata": { + "papermill": { + "duration": 0.000056, + "end_time": "2025-12-19T10:23:41.517425", + "exception": false, + "start_time": "2025-12-19T10:23:41.517369", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", + "With this we can see the actual numbers (although we cannot tell which ADM2 have low values)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78d92e4a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.522513Z", + "iopub.status.busy": "2025-12-19T10:23:41.520272Z", + "iopub.status.idle": "2025-12-19T10:23:42.935181Z", + "shell.execute_reply": "2025-12-19T10:23:42.932661Z" + }, + "papermill": { + "duration": 1.456494, + "end_time": "2025-12-19T10:23:42.974012", + "exception": false, + "start_time": "2025-12-19T10:23:41.517518", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Line point plot faceted by YEAR\n", + "ggplot(data = data_to_plot) +\n", + " geom_line(aes(x = MONTH,\n", + " y = REPORTING_RATE,\n", + " group = ADM2_ID,\n", + " color = REPORTING_RATE_CATEGORY), \n", + " alpha = 0.3,\n", + " show.legend = FALSE\n", + " ) +\n", + " geom_point(aes(x = MONTH,\n", + " y = REPORTING_RATE,\n", + " group = ADM2_ID,\n", + " color = REPORTING_RATE_CATEGORY)) + \n", + " facet_grid(~YEAR) + \n", + " scale_color_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate Categories\"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " scale_y_continuous(\n", + " breaks = c(0, break_vals), # 🎨 NEW dynamic colors & breaks!\n", + " # Dynamically set max value to fit actual data (do show values >1 if present)\n", + " limits = c(0, max(data_to_plot$REPORTING_RATE, na.rm = TRUE) + 0.1)\n", + " ) +\n", + " 
labs(\n", + " title = \"Reporting Rate (Dataset)\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " x = \"Month\",\n", + " y = \"Reporting Rate\\n(Dataset)\" ) +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"none\",\n", + " legend.title = element_blank(),\n", + " # legend.key.width = unit(3, \"cm\"),\n", + " # legend.key.height = unit(0.25, \"cm\"),\n", + " axis.title.y = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major.x = element_blank(),\n", + " strip.placement = \"outside\",\n", + " strip.text = element_text(face = \"bold\", size = 10)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f47064a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:42.978498Z", + "iopub.status.busy": "2025-12-19T10:23:42.976659Z", + "iopub.status.idle": "2025-12-19T10:23:44.087244Z", + "shell.execute_reply": "2025-12-19T10:23:44.085182Z" + }, + "papermill": { + "duration": 1.11568, + "end_time": "2025-12-19T10:23:44.089891", + "exception": false, + "start_time": "2025-12-19T10:23:42.974211", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Export plot as PNG\n", + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " height = 15,\n", + " width = 45,\n", + " units = \"cm\",\n", + " bg = \"white\",\n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": 
"22bb6431", + "metadata": { + "papermill": { + "duration": 0.000147, + "end_time": "2025-12-19T10:23:44.090320", + "exception": false, + "start_time": "2025-12-19T10:23:44.090173", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", + "This is less good for identifying actual values, but allows to see which ADM2 have lower values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2445f2a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:44.094508Z", + "iopub.status.busy": "2025-12-19T10:23:44.092577Z", + "iopub.status.idle": "2025-12-19T10:23:46.262550Z", + "shell.execute_reply": "2025-12-19T10:23:46.259633Z" + }, + "papermill": { + "duration": 2.21647, + "end_time": "2025-12-19T10:23:46.306927", + "exception": false, + "start_time": "2025-12-19T10:23:44.090457", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Tile plot faceted by YEAR\n", + "ggplot(data = data_to_plot) +\n", + " geom_tile(aes(x = MONTH,\n", + " y = fct_rev(ADM2_NAME),\n", + " fill = REPORTING_RATE_CATEGORY), \n", + " color = \"white\",\n", + " show.legend = TRUE\n", + " ) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate: \"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset)\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " x = \"Month\"\n", + " ) +\n", + " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", + " scales = \"free_y\", space = \"free_y\",\n", + " switch = \"y\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.key.height = unit(0.25, \"cm\"),\n", + " axis.text.x = element_text(size 
= 7),\n", + " axis.title.y = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major = element_blank(),\n", + " strip.placement = \"outside\", \n", + " strip.text = element_text(face = \"bold\", size = 10)\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbe73312", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:46.311134Z", + "iopub.status.busy": "2025-12-19T10:23:46.309412Z", + "iopub.status.idle": "2025-12-19T10:23:48.286664Z", + "shell.execute_reply": "2025-12-19T10:23:48.284571Z" + }, + "papermill": { + "duration": 1.982105, + "end_time": "2025-12-19T10:23:48.289215", + "exception": false, + "start_time": "2025-12-19T10:23:46.307110", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Export plot as PNG\n", + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 20, height = 30, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3eef141a", + "metadata": { + "papermill": { + "duration": 0.000164, + "end_time": "2025-12-19T10:23:48.289656", + "exception": false, + "start_time": "2025-12-19T10:23:48.289492", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.3. 
MAP of Reporting Rate - by month" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83be9c68", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:48.294030Z", + "iopub.status.busy": "2025-12-19T10:23:48.292256Z", + "iopub.status.idle": "2025-12-19T10:23:53.205670Z", + "shell.execute_reply": "2025-12-19T10:23:53.203104Z" + }, + "papermill": { + "duration": 4.958481, + "end_time": "2025-12-19T10:23:53.248341", + "exception": false, + "start_time": "2025-12-19T10:23:48.289860", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choropleth map with reporting rate data by ADM2\n", + "ggplot(data = data_to_plot) +\n", + " geom_sf(aes(\n", + " fill = REPORTING_RATE_CATEGORY,\n", + " geometry = geometry), \n", + " color = \"white\",\n", + " size = 0.01) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " ) +\n", + " theme_void() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " legend.key.height = unit(0.25, \"cm\")\n", + " ) +\n", + " labs(\n", + " title = paste(\"Reporting Rate (Dataset)\"),\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " ) +\n", + " facet_grid(\n", + " rows = vars(YEAR), \n", + " cols = vars(MONTH),\n", + " switch = \"both\") +\n", + " guides(fill = guide_legend(nrow = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e877671d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:53.252696Z", + "iopub.status.busy": "2025-12-19T10:23:53.250972Z", + "iopub.status.idle": "2025-12-19T10:23:56.748868Z", + "shell.execute_reply": "2025-12-19T10:23:56.746990Z" + }, + "papermill": { + "duration": 3.502689, + "end_time": "2025-12-19T10:23:56.751218", + "exception": false, + 
"start_time": "2025-12-19T10:23:53.248529", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 50, height = 20, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "f0894be9", + "metadata": { + "papermill": { + "duration": 0.000166, + "end_time": "2025-12-19T10:23:56.751636", + "exception": false, + "start_time": "2025-12-19T10:23:56.751470", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", + "Use average (`mean()`) of monthly values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1995ab", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:56.755998Z", + "iopub.status.busy": "2025-12-19T10:23:56.753982Z", + "iopub.status.idle": "2025-12-19T10:23:56.788391Z", + "shell.execute_reply": "2025-12-19T10:23:56.786447Z" + }, + "papermill": { + "duration": 0.039325, + "end_time": "2025-12-19T10:23:56.791143", + "exception": false, + "start_time": "2025-12-19T10:23:56.751818", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_to_plot_year <- data_to_plot %>%\n", + " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", + " summarise(\n", + " REPORTING_RATE = mean(REPORTING_RATE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " # Calculate REPORTING_RATE_CATEGORY again based on the yearly average\n", + " mutate(\n", + " REPORTING_RATE_CATEGORY = cut(\n", + " REPORTING_RATE,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd32b0cf", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:56.795010Z", + "iopub.status.busy": "2025-12-19T10:23:56.793453Z", + "iopub.status.idle": "2025-12-19T10:23:57.582261Z", + "shell.execute_reply": "2025-12-19T10:23:57.579294Z" + }, + "papermill": { + "duration": 0.798686, + "end_time": "2025-12-19T10:23:57.590023", + "exception": false, + "start_time": "2025-12-19T10:23:56.791337", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choropleth map with reporting rate data by ADM2\n", + "ggplot(data = data_to_plot_year) +\n", + " geom_sf(aes(\n", + " fill = 
REPORTING_RATE_CATEGORY,\n", + " geometry = geometry), \n", + " color = \"white\",\n", + " size = 0.01) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\"\n", + " ) +\n", + " theme_void() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " ) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset) - mean per Year\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " fill = \"Reporting Rate: \"\n", + " ) +\n", + " facet_grid(\n", + " cols = vars(YEAR)\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0430641e", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:57.594096Z", + "iopub.status.busy": "2025-12-19T10:23:57.592357Z", + "iopub.status.idle": "2025-12-19T10:23:58.516754Z", + "shell.execute_reply": "2025-12-19T10:23:58.514785Z" + }, + "papermill": { + "duration": 0.928933, + "end_time": "2025-12-19T10:23:58.519148", + "exception": false, + "start_time": "2025-12-19T10:23:57.590215", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 31, height = 13, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "8c3bdca4", + "metadata": { + "papermill": { + "duration": 0.000126, + "end_time": 
"2025-12-19T10:23:58.519515", + "exception": false, + "start_time": "2025-12-19T10:23:58.519389", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### The End :)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a62ec5", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:58.523680Z", + "iopub.status.busy": "2025-12-19T10:23:58.522024Z", + "iopub.status.idle": "2025-12-19T10:23:58.733860Z", + "shell.execute_reply": "2025-12-19T10:23:58.731929Z" + }, + "papermill": { + "duration": 0.216448, + "end_time": "2025-12-19T10:23:58.736160", + "exception": false, + "start_time": "2025-12-19T10:23:58.519712", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" + ] } - }, - "outputs": [], - "source": [ - "# Make string of product uids for plot subtitles\n", - "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", - "rr_product_uid" - ] - }, - { - "cell_type": "markdown", - "id": "30b058f4", - "metadata": { - "papermill": { - "duration": 0.000094, - "end_time": "2025-12-19T10:23:35.399231", - "exception": false, - "start_time": "2025-12-19T10:23:35.399137", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.2. 
Load and check `snt metadata` file\n", - "This is needed for the correct use of palettes and categories (breaks, or scale)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98a8ee49", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.403224Z", - "iopub.status.busy": "2025-12-19T10:23:35.401458Z", - "iopub.status.idle": "2025-12-19T10:23:36.335964Z", - "shell.execute_reply": "2025-12-19T10:23:36.330643Z" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, "papermill": { - "duration": 0.940593, - "end_time": "2025-12-19T10:23:36.339927", - "exception": false, - "start_time": "2025-12-19T10:23:35.399334", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "default_parameters": {}, + "duration": 32.950872, + "end_time": "2025-12-19T10:23:59.058917", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/snt_dhis2_reporting_rate_dataset_report_OUTPUT_2025-12-19_102325.ipynb", + "parameters": {}, + "start_time": "2025-12-19T10:23:26.108045", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "# Load SNT metadata\n", - "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "00681217", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:36.357945Z", - "iopub.status.busy": "2025-12-19T10:23:36.343228Z", - "iopub.status.idle": "2025-12-19T10:23:36.535579Z", - "shell.execute_reply": "2025-12-19T10:23:36.533231Z" - }, - "papermill": { - "duration": 0.198107, - "end_time": "2025-12-19T10:23:36.538224", - "exception": false, - "start_time": "2025-12-19T10:23:36.340117", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "break_vals <- jsonlite::fromJSON(metadata_json$REPORTING_RATE$SCALE)\n", - "\n", - "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" - ] - }, - { - "cell_type": "markdown", - "id": "f3470564", - "metadata": { - "papermill": { - "duration": 0.000162, - "end_time": "2025-12-19T10:23:36.538638", - "exception": false, - "start_time": "2025-12-19T10:23:36.538476", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "82397307", - "metadata": { - "papermill": { - "duration": 0.000126, - "end_time": "2025-12-19T10:23:36.538947", - "exception": false, - "start_time": "2025-12-19T10:23:36.538821", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 2.1. 
Output of pipeline notebook\n", - "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70acb2c5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:36.543564Z", - "iopub.status.busy": "2025-12-19T10:23:36.541311Z", - "iopub.status.idle": "2025-12-19T10:23:37.788619Z", - "shell.execute_reply": "2025-12-19T10:23:37.785121Z" - }, - "papermill": { - "duration": 1.253125, - "end_time": "2025-12-19T10:23:37.792249", - "exception": false, - "start_time": "2025-12-19T10:23:36.539124", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "\n", - "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading Reporting Rate (Dataset) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", - "dim(reporting_rate_dataset)\n", - "head(reporting_rate_dataset, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "48833515", - "metadata": { - "papermill": { - "duration": 0.000091, - "end_time": "2025-12-19T10:23:37.792528", - "exception": false, - "start_time": "2025-12-19T10:23:37.792437", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 2.2. 
Shapes\n", - "To make choropleth (map)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3febd4f4", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:37.798194Z", - "iopub.status.busy": "2025-12-19T10:23:37.795402Z", - "iopub.status.idle": "2025-12-19T10:23:41.325848Z", - "shell.execute_reply": "2025-12-19T10:23:41.323895Z" - }, - "papermill": { - "duration": 3.535554, - "end_time": "2025-12-19T10:23:41.328226", - "exception": false, - "start_time": "2025-12-19T10:23:37.792672", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes <- NULL\n", - " })\n", - "\n", - "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. \\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", - "names(shapes)" - ] - }, - { - "cell_type": "markdown", - "id": "17067d56", - "metadata": { - "papermill": { - "duration": 0.000166, - "end_time": "2025-12-19T10:23:41.328651", - "exception": false, - "start_time": "2025-12-19T10:23:41.328485", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3. Plots" - ] - }, - { - "cell_type": "markdown", - "id": "9a6369ee", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2025-12-19T10:23:41.328959", - "exception": false, - "start_time": "2025-12-19T10:23:41.328850", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.0. 
Add shapes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6641720", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.333105Z", - "iopub.status.busy": "2025-12-19T10:23:41.331427Z", - "iopub.status.idle": "2025-12-19T10:23:41.365417Z", - "shell.execute_reply": "2025-12-19T10:23:41.363294Z" - }, - "papermill": { - "duration": 0.03905, - "end_time": "2025-12-19T10:23:41.368213", - "exception": false, - "start_time": "2025-12-19T10:23:41.329163", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Join shapes to reporting rate data\n", - "\n", - "data_to_plot <- reporting_rate_dataset %>%\n", - " left_join(shapes, by = c(\"ADM2_ID\"))" - ] - }, - { - "cell_type": "markdown", - "id": "0b0d32f1", - "metadata": { - "papermill": { - "duration": 0.000195, - "end_time": "2025-12-19T10:23:41.368739", - "exception": false, - "start_time": "2025-12-19T10:23:41.368544", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.1. 🎨 Dynamic categories and color assignement" - ] - }, - { - "cell_type": "markdown", - "id": "cc765e0c", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2025-12-19T10:23:41.369057", - "exception": false, - "start_time": "2025-12-19T10:23:41.368948", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 1. 
Define breaks and labels" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e79132c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.373558Z", - "iopub.status.busy": "2025-12-19T10:23:41.371555Z", - "iopub.status.idle": "2025-12-19T10:23:41.392950Z", - "shell.execute_reply": "2025-12-19T10:23:41.390333Z" - }, - "papermill": { - "duration": 0.026996, - "end_time": "2025-12-19T10:23:41.396238", - "exception": false, - "start_time": "2025-12-19T10:23:41.369242", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Safety code to avoid breaking if nothings is fund in json_metadata\n", - "if (is.null(break_vals) || length(break_vals) == 0) {\n", - " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", - " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f04cb888", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.401034Z", - "iopub.status.busy": "2025-12-19T10:23:41.398849Z", - "iopub.status.idle": "2025-12-19T10:23:41.430720Z", - "shell.execute_reply": "2025-12-19T10:23:41.428238Z" - }, - "papermill": { - "duration": 0.037712, - "end_time": "2025-12-19T10:23:41.434131", - "exception": false, - "start_time": "2025-12-19T10:23:41.396419", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 1. Define breaks\n", - "# Note: assumes that the data starts at 0!\n", - "# break_vals <- metadata_json$REPORTING_RATE$SCALE # moved upstream\n", - "\n", - "# 2. Create the full set of cut points (0 to Infinity)\n", - "full_breaks <- c(0, break_vals, Inf)\n", - "\n", - "# 3. 
Create dynamic labels\n", - "labels <- c(\n", - " paste0(\"< \", break_vals[1]), # First label\n", - " paste0(break_vals[-length(break_vals)], \" - \", break_vals[-1]), # Middle labels\n", - " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", - ")\n", - "\n", - "# Check\n", - "labels" - ] - }, - { - "cell_type": "markdown", - "id": "cb237801", - "metadata": { - "papermill": { - "duration": 0.000102, - "end_time": "2025-12-19T10:23:41.434442", - "exception": false, - "start_time": "2025-12-19T10:23:41.434340", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 2. Create `_CATEGORY` col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8303488", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.439376Z", - "iopub.status.busy": "2025-12-19T10:23:41.437165Z", - "iopub.status.idle": "2025-12-19T10:23:41.471891Z", - "shell.execute_reply": "2025-12-19T10:23:41.469251Z" - }, - "papermill": { - "duration": 0.040632, - "end_time": "2025-12-19T10:23:41.475176", - "exception": false, - "start_time": "2025-12-19T10:23:41.434544", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", - "data_to_plot <- data_to_plot %>%\n", - " mutate(\n", - " REPORTING_RATE_CATEGORY = cut(\n", - " REPORTING_RATE,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "a10237f8", - "metadata": { - "papermill": { - "duration": 0.000102, - "end_time": "2025-12-19T10:23:41.475483", - "exception": false, - "start_time": "2025-12-19T10:23:41.475381", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3. 
Pick appropriate palette" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ee6e077", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.480216Z", - "iopub.status.busy": "2025-12-19T10:23:41.478061Z", - "iopub.status.idle": "2025-12-19T10:23:41.513805Z", - "shell.execute_reply": "2025-12-19T10:23:41.511268Z" - }, - "papermill": { - "duration": 0.04138, - "end_time": "2025-12-19T10:23:41.516984", - "exception": false, - "start_time": "2025-12-19T10:23:41.475604", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count nr of breaks\n", - "nr_of_colors <- length(labels)\n", - "\n", - "# nr_of_colors\n", - "palette_to_use <- get_range_from_count(nr_of_colors)\n", - "\n", - "# Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", - "# Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correcpond to lowest value)\n", - "names(palette_to_use) <- rev(labels)\n", - "\n", - "palette_to_use\n" - ] - }, - { - "cell_type": "markdown", - "id": "d08c0c14", - "metadata": { - "papermill": { - "duration": 0.000099, - "end_time": "2025-12-19T10:23:41.517267", - "exception": false, - "start_time": "2025-12-19T10:23:41.517168", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.2. Plots" - ] - }, - { - "cell_type": "markdown", - "id": "b7781198", - "metadata": { - "papermill": { - "duration": 0.000056, - "end_time": "2025-12-19T10:23:41.517425", - "exception": false, - "start_time": "2025-12-19T10:23:41.517369", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", - "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78d92e4a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.522513Z", - "iopub.status.busy": "2025-12-19T10:23:41.520272Z", - "iopub.status.idle": "2025-12-19T10:23:42.935181Z", - "shell.execute_reply": "2025-12-19T10:23:42.932661Z" - }, - "papermill": { - "duration": 1.456494, - "end_time": "2025-12-19T10:23:42.974012", - "exception": false, - "start_time": "2025-12-19T10:23:41.517518", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Line point plot faceted by YEAR\n", - "ggplot(data = data_to_plot) +\n", - " geom_line(aes(x = MONTH,\n", - " y = REPORTING_RATE,\n", - " group = ADM2_ID,\n", - " color = REPORTING_RATE_CATEGORY), \n", - " alpha = 0.3,\n", - " show.legend = FALSE\n", - " ) +\n", - " geom_point(aes(x = MONTH,\n", - " y = REPORTING_RATE,\n", - " group = ADM2_ID,\n", - " color = REPORTING_RATE_CATEGORY)) + \n", - " facet_grid(~YEAR) + \n", - " scale_color_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate Categories\"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " scale_y_continuous(\n", - " breaks = c(0, break_vals), # 🎨 NEW dynamic colors & breaks!\n", - " # Dynamically set max value to fit actual data (do show values >1 if present)\n", - " limits = c(0, max(data_to_plot$REPORTING_RATE, na.rm = TRUE) + 0.1)\n", - " ) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset)\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " x = \"Month\",\n", - " y = \"Reporting Rate\\n(Dataset)\" ) +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"none\",\n", - " legend.title = element_blank(),\n", - " # legend.key.width = unit(3, \"cm\"),\n", - " # legend.key.height = unit(0.25, 
\"cm\"),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major.x = element_blank(),\n", - " strip.placement = \"outside\",\n", - " strip.text = element_text(face = \"bold\", size = 10)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f47064a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:42.978498Z", - "iopub.status.busy": "2025-12-19T10:23:42.976659Z", - "iopub.status.idle": "2025-12-19T10:23:44.087244Z", - "shell.execute_reply": "2025-12-19T10:23:44.085182Z" - }, - "papermill": { - "duration": 1.11568, - "end_time": "2025-12-19T10:23:44.089891", - "exception": false, - "start_time": "2025-12-19T10:23:42.974211", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Export plot as PNG\n", - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " height = 15,\n", - " width = 45,\n", - " units = \"cm\",\n", - " bg = \"white\",\n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "22bb6431", - "metadata": { - "papermill": { - "duration": 0.000147, - "end_time": "2025-12-19T10:23:44.090320", - "exception": false, - "start_time": "2025-12-19T10:23:44.090173", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", - "This is less good for identifying actual values, but allows to see which ADM2 have lower values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2445f2a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:44.094508Z", - "iopub.status.busy": "2025-12-19T10:23:44.092577Z", - "iopub.status.idle": "2025-12-19T10:23:46.262550Z", - "shell.execute_reply": "2025-12-19T10:23:46.259633Z" - }, - "papermill": { - "duration": 2.21647, - "end_time": "2025-12-19T10:23:46.306927", - "exception": false, - "start_time": "2025-12-19T10:23:44.090457", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Tile plot faceted by YEAR\n", - "ggplot(data = data_to_plot) +\n", - " geom_tile(aes(x = MONTH,\n", - " y = fct_rev(ADM2_NAME),\n", - " fill = REPORTING_RATE_CATEGORY), \n", - " color = \"white\",\n", - " show.legend = TRUE\n", - " ) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate: \"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset)\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " x = \"Month\"\n", - " ) +\n", - " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", - " scales = \"free_y\", space = \"free_y\",\n", - " switch = \"y\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.key.height = unit(0.25, \"cm\"),\n", - " axis.text.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major = element_blank(),\n", - " strip.placement = \"outside\", \n", - " strip.text = element_text(face = \"bold\", size = 10)\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cbe73312", - "metadata": { - 
"execution": { - "iopub.execute_input": "2025-12-19T10:23:46.311134Z", - "iopub.status.busy": "2025-12-19T10:23:46.309412Z", - "iopub.status.idle": "2025-12-19T10:23:48.286664Z", - "shell.execute_reply": "2025-12-19T10:23:48.284571Z" - }, - "papermill": { - "duration": 1.982105, - "end_time": "2025-12-19T10:23:48.289215", - "exception": false, - "start_time": "2025-12-19T10:23:46.307110", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Export plot as PNG\n", - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 20, height = 30, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3eef141a", - "metadata": { - "papermill": { - "duration": 0.000164, - "end_time": "2025-12-19T10:23:48.289656", - "exception": false, - "start_time": "2025-12-19T10:23:48.289492", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.3. 
MAP of Reporting Rate - by month" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83be9c68", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:48.294030Z", - "iopub.status.busy": "2025-12-19T10:23:48.292256Z", - "iopub.status.idle": "2025-12-19T10:23:53.205670Z", - "shell.execute_reply": "2025-12-19T10:23:53.203104Z" - }, - "papermill": { - "duration": 4.958481, - "end_time": "2025-12-19T10:23:53.248341", - "exception": false, - "start_time": "2025-12-19T10:23:48.289860", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choropleth map with reporting rate data by ADM2\n", - "ggplot(data = data_to_plot) +\n", - " geom_sf(aes(\n", - " fill = REPORTING_RATE_CATEGORY,\n", - " geometry = geometry), \n", - " color = \"white\",\n", - " size = 0.01) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " ) +\n", - " theme_void() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " legend.key.height = unit(0.25, \"cm\")\n", - " ) +\n", - " labs(\n", - " title = paste(\"Reporting Rate (Dataset)\"),\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " ) +\n", - " facet_grid(\n", - " rows = vars(YEAR), \n", - " cols = vars(MONTH),\n", - " switch = \"both\") +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e877671d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:53.252696Z", - "iopub.status.busy": "2025-12-19T10:23:53.250972Z", - "iopub.status.idle": "2025-12-19T10:23:56.748868Z", - "shell.execute_reply": "2025-12-19T10:23:56.746990Z" - }, - "papermill": { - "duration": 3.502689, - "end_time": "2025-12-19T10:23:56.751218", - "exception": false, - 
"start_time": "2025-12-19T10:23:53.248529", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 50, height = 20, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "f0894be9", - "metadata": { - "papermill": { - "duration": 0.000166, - "end_time": "2025-12-19T10:23:56.751636", - "exception": false, - "start_time": "2025-12-19T10:23:56.751470", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", - "Use average (`mean()`) of monthly values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb1995ab", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:56.755998Z", - "iopub.status.busy": "2025-12-19T10:23:56.753982Z", - "iopub.status.idle": "2025-12-19T10:23:56.788391Z", - "shell.execute_reply": "2025-12-19T10:23:56.786447Z" - }, - "papermill": { - "duration": 0.039325, - "end_time": "2025-12-19T10:23:56.791143", - "exception": false, - "start_time": "2025-12-19T10:23:56.751818", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "data_to_plot_year <- data_to_plot %>%\n", - " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", - " summarise(\n", - " REPORTING_RATE = mean(REPORTING_RATE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " # Calculate REPORTING_RATE_CATEGORY again based on the yearly average\n", - " mutate(\n", - " REPORTING_RATE_CATEGORY = cut(\n", - " REPORTING_RATE,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd32b0cf", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:56.795010Z", - "iopub.status.busy": "2025-12-19T10:23:56.793453Z", - "iopub.status.idle": "2025-12-19T10:23:57.582261Z", - "shell.execute_reply": "2025-12-19T10:23:57.579294Z" - }, - "papermill": { - "duration": 0.798686, - "end_time": "2025-12-19T10:23:57.590023", - "exception": false, - "start_time": "2025-12-19T10:23:56.791337", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choropleth map with reporting rate data by ADM2\n", - "ggplot(data = data_to_plot_year) +\n", - " geom_sf(aes(\n", - " fill = 
REPORTING_RATE_CATEGORY,\n", - " geometry = geometry), \n", - " color = \"white\",\n", - " size = 0.01) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\"\n", - " ) +\n", - " theme_void() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " ) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset) - mean per Year\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " fill = \"Reporting Rate: \"\n", - " ) +\n", - " facet_grid(\n", - " cols = vars(YEAR)\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0430641e", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:57.594096Z", - "iopub.status.busy": "2025-12-19T10:23:57.592357Z", - "iopub.status.idle": "2025-12-19T10:23:58.516754Z", - "shell.execute_reply": "2025-12-19T10:23:58.514785Z" - }, - "papermill": { - "duration": 0.928933, - "end_time": "2025-12-19T10:23:58.519148", - "exception": false, - "start_time": "2025-12-19T10:23:57.590215", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 31, height = 13, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "8c3bdca4", - "metadata": { - "papermill": { - "duration": 0.000126, - "end_time": 
"2025-12-19T10:23:58.519515", - "exception": false, - "start_time": "2025-12-19T10:23:58.519389", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### The End :)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8a62ec5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:58.523680Z", - "iopub.status.busy": "2025-12-19T10:23:58.522024Z", - "iopub.status.idle": "2025-12-19T10:23:58.733860Z", - "shell.execute_reply": "2025-12-19T10:23:58.731929Z" - }, - "papermill": { - "duration": 0.216448, - "end_time": "2025-12-19T10:23:58.736160", - "exception": false, - "start_time": "2025-12-19T10:23:58.519712", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 32.950872, - "end_time": "2025-12-19T10:23:59.058917", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/snt_dhis2_reporting_rate_dataset_report_OUTPUT_2025-12-19_102325.ipynb", - "parameters": {}, - "start_time": "2025-12-19T10:23:26.108045", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index a4679d0..458602a 100644 --- 
a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -15,36 +15,18 @@ @pipeline("snt_dhis2_reporting_rate_dataelement") @parameter( - "outliers_method", - name="Outliers detection method", - help="Specify which method was used to detect outliers in routine data. " - "Chose 'Routine data (Raw)' to use raw routine data.", + "routine_data_choice", + name="Routine data source", + help="Select which routine data to use. " + "'raw' loads formatted routine data, " + "'imputed' loads outliers-imputed routine data, " + "'outliers_removed' loads routine data with outliers removed.", multiple=False, - choices=[ - "Routine data (Raw)", - "Mean (Classic)", - "Median (Classic)", - "IQR (Classic)", - "Trend (PATH)", - "MG Partial (MagicGlasses2)", - "MG Complete (MagicGlasses2)", - ], + choices=["raw", "imputed", "outliers_removed"], type=str, - default="Routine data (Raw)", + default="imputed", required=True, ) -@parameter( - "use_removed_outliers", - name="Use routine data with outliers removed (else: uses imputed)", - help="Enable this option to use routine data after outliers have been removed, " - "based on the outlier detection method you selected above. 
" - " If you leave this off, the pipeline will instead use either:" - " A) the imputed routine data (where outlier values have been replaced), or" - " B) the raw routine data, if you chose 'Routine data (Raw)' as your outlier processing method.", - type=bool, - default=False, - required=False, -) @parameter( "activity_indicators", name="Facility Activity indicators", @@ -104,8 +86,7 @@ required=False, ) def snt_dhis2_reporting_rate_dataelement( - outliers_method: str, - use_removed_outliers: bool, + routine_data_choice: str, activity_indicators: str, volume_activity_indicators: str, dataelement_method_denominator: str, @@ -135,9 +116,12 @@ def snt_dhis2_reporting_rate_dataelement( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] # Build parameters dict and save to JSON in all cases (like other pipelines) - routine_file = f"{country_code}{resolve_routine_filename(outliers_method, use_removed_outliers)}" + routine_file = resolve_routine_filename( + country_code=country_code, routine_data_choice=routine_data_choice + ) nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), + "ROUTINE_DATA_CHOICE": routine_data_choice, "ROUTINE_FILE": routine_file, "DATAELEMENT_METHOD_DENOMINATOR": dataelement_method_denominator, "ACTIVITY_INDICATORS": activity_indicators, @@ -153,7 +137,7 @@ def snt_dhis2_reporting_rate_dataelement( current_run.log_info(f"Saved pipeline parameters to {parameters_file}") if not run_report_only: - if outliers_method == "Routine data (Raw)": + if routine_data_choice == "raw": ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] @@ -203,44 +187,18 @@ def snt_dhis2_reporting_rate_dataelement( raise -def resolve_routine_filename(outliers_method: str, is_removed: bool) -> str: - """Returns the routine data filename based on the selected outliers method. 
- - Parameters - ---------- - outliers_method : str - The method used for outlier removal. - is_removed : bool - Whether to return the filename for removed outliers or imputed outliers. - - Returns - ------- - str - The filename corresponding to the selected outliers method. - - Raises - ------ - ValueError - If the outliers method is unknown. - """ - if outliers_method == "Routine data (Raw)": - return "_routine.parquet" - - method_suffix_map = { - "Mean (Classic)": "mean", - "Median (Classic)": "median", - "IQR (Classic)": "iqr", - "Trend (PATH)": "trend", - "MG Partial (MagicGlasses2)": "mg-partial", - "MG Complete (MagicGlasses2)": "mg-complete", - } +def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: + """Returns the canonical routine filename for a routine data choice.""" + if routine_data_choice == "raw": + return f"{country_code}_routine.parquet" - try: - suffix = method_suffix_map[outliers_method] - except KeyError as err: - raise ValueError(f"Unknown outliers method: {outliers_method}") from err + if routine_data_choice == "imputed": + return f"{country_code}_routine_outliers_imputed.parquet" + + if routine_data_choice == "outliers_removed": + return f"{country_code}_routine_outliers_removed.parquet" - return f"_routine_outliers-{suffix}{'_removed' if is_removed else '_imputed'}.parquet" + raise ValueError(f"Unknown routine data choice: {routine_data_choice}") if __name__ == "__main__": diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index b52c32c..46e6289 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -89,6 +89,7 @@ def snt_dhis2_reporting_rate_dataset( nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), + "ROUTINE_DATA_CHOICE": routine_data_choice, "ROUTINE_FILE": routine_file, }