From 01e017e79aebfaee899fc0689592884ec00d3f3e Mon Sep 17 00:00:00 2001 From: EMontandon Date: Fri, 13 Mar 2026 09:42:09 +0100 Subject: [PATCH 1/8] review branch --- d2d_development/README.md | 6 + d2d_development/d2d_development/__init__.py | 0 .../d2d_development/data_models.py | 228 +++++++++ .../d2d_development/dataset_completion.py | 422 +++++++++++++++ d2d_development/d2d_development/exceptions.py | 6 + d2d_development/d2d_development/extract.py | 462 +++++++++++++++++ .../d2d_development/org_unit_aligner.py | 351 +++++++++++++ d2d_development/d2d_development/push.py | 335 ++++++++++++ d2d_development/d2d_development/utils.py | 89 ++++ d2d_development/pyproject.toml | 95 ++++ d2d_development/tests/__init__.py | 0 d2d_development/tests/mock_dhis2_get.py | 238 +++++++++ d2d_development/tests/mock_dhis2_post.py | 484 ++++++++++++++++++ d2d_development/tests/test_data_point.py | 77 +++ d2d_development/tests/test_extract.py | 259 ++++++++++ d2d_development/tests/test_push.py | 463 +++++++++++++++++ d2d_development/tests/test_utils.py | 113 ++++ 17 files changed, 3628 insertions(+) create mode 100644 d2d_development/README.md create mode 100644 d2d_development/d2d_development/__init__.py create mode 100644 d2d_development/d2d_development/data_models.py create mode 100644 d2d_development/d2d_development/dataset_completion.py create mode 100644 d2d_development/d2d_development/exceptions.py create mode 100644 d2d_development/d2d_development/extract.py create mode 100644 d2d_development/d2d_development/org_unit_aligner.py create mode 100644 d2d_development/d2d_development/push.py create mode 100644 d2d_development/d2d_development/utils.py create mode 100644 d2d_development/pyproject.toml create mode 100644 d2d_development/tests/__init__.py create mode 100644 d2d_development/tests/mock_dhis2_get.py create mode 100644 d2d_development/tests/mock_dhis2_post.py create mode 100644 d2d_development/tests/test_data_point.py create mode 100644 
d2d_development/tests/test_extract.py create mode 100644 d2d_development/tests/test_push.py create mode 100644 d2d_development/tests/test_utils.py diff --git a/d2d_development/README.md b/d2d_development/README.md new file mode 100644 index 0000000..6cd018d --- /dev/null +++ b/d2d_development/README.md @@ -0,0 +1,6 @@ +# openhexa-ds-developments +Development repo for DS team OpenHEXA utilities. + + +Install package : pip install git+https://github.com/BLSQ/openhexa-ds-developments.git#subdirectory=d2d_development + diff --git a/d2d_development/d2d_development/__init__.py b/d2d_development/d2d_development/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d2d_development/d2d_development/data_models.py b/d2d_development/d2d_development/data_models.py new file mode 100644 index 0000000..2d0b344 --- /dev/null +++ b/d2d_development/d2d_development/data_models.py @@ -0,0 +1,228 @@ +import json +from dataclasses import dataclass +from enum import Enum +from typing import NamedTuple + +import pandas as pd + + +class DataType(Enum): + """Enumeration of supported DHIS2 data types for extraction.""" + + DATA_ELEMENT = "DATA_ELEMENT" + REPORTING_RATE = "REPORTING_RATE" + INDICATOR = "INDICATOR" + + +@dataclass +class DataPointModel: + """Data model representing a DHIS2 data point. + + Attributes + ---------- + dataElement : str + The unique identifier for the data element. + period : str + The reporting period for the data point. + orgUnit : str + The organizational unit associated with the data point. + categoryOptionCombo : str + The category option combination identifier. + attributeOptionCombo : str + The attribute option combination identifier. + value : float + The value of the data point. 
+ """ + + dataElement: str # noqa: N815 + period: str + orgUnit: str # noqa: N815 + categoryOptionCombo: str # noqa: N815 + attributeOptionCombo: str # noqa: N815 + value: str + + def to_json(self) -> dict: + """Return a dictionary representation of the data point suitable for DHIS2 JSON format. + + Returns + ------- + dict + A dictionary with keys corresponding to DHIS2 data value fields. + """ + if self.value is None or (isinstance(self.value, str) and not self.value.strip()): + return { + "dataElement": self.dataElement, + "period": self.period, + "orgUnit": self.orgUnit, + "categoryOptionCombo": self.categoryOptionCombo, + "attributeOptionCombo": self.attributeOptionCombo, + "value": "", + "comment": "deleted value", + } + + return { + "dataElement": self.dataElement, + "period": self.period, + "orgUnit": self.orgUnit, + "categoryOptionCombo": self.categoryOptionCombo, + "attributeOptionCombo": self.attributeOptionCombo, + "value": self.value, + } + + def __str__(self) -> str: + return ( + f"DataPointModel(" + f"dataElement={self.dataElement}, " + f"period={self.period}, " + f"orgUnit={self.orgUnit}, " + f"categoryOptionCombo={self.categoryOptionCombo}, " + f"attributeOptionCombo={self.attributeOptionCombo}, " + f"value={self.value})" + ) + + +@dataclass +class OrgUnitModel: + """Helper object definition to represent an organizational unit.""" + + id: str + name: str + shortName: str # noqa: N815 + openingDate: str # noqa: N815 + closedDate: str # noqa: N815 + parent: dict + level: int + path: str + geometry: str + + +class OrgUnitRow(NamedTuple): + """Helper object definition to represent an organizational unit.""" + + id: str + name: str + shortName: str # noqa: N815 + openingDate: str # noqa: N815 + closedDate: str | None # noqa: N815 + parent: dict | None + level: int + path: str + geometry: str | dict | None + + +class OrgUnitObj: # noqa: PLW1641 (no hashing) + """Helper class definition to store/create the correct OrgUnit JSON format.""" + + def 
__init__(self, org_unit: OrgUnitRow | pd.Series | tuple): + """Create a new org unit instance. + + Parameters + ---------- + org_unit : OrgUnitRow | pd.Series + The organizational unit data. + Expects columns with names : + ['id', 'name', 'shortName', 'openingDate', 'closedDate', 'parent','level', 'path', 'geometry'] + """ + if isinstance(org_unit, pd.Series): + # Convert Series to OrgUnitRow + org_unit = OrgUnitRow( + id=org_unit["id"], + name=org_unit["name"], + shortName=org_unit["shortName"], + openingDate=org_unit["openingDate"], + closedDate=org_unit["closedDate"], + parent=org_unit["parent"], + level=org_unit["level"], + path=org_unit["path"], + geometry=org_unit["geometry"], + ) + elif isinstance(org_unit, tuple) and hasattr(org_unit, "_fields"): + org_unit = OrgUnitRow(**org_unit._asdict()) + elif not isinstance(org_unit, OrgUnitRow): + raise TypeError(f"Expected OrgUnitRow, pd.Series, or tuple, got {type(org_unit)}") + + self.initialize_from(org_unit_tuple=org_unit) + + def initialize_from(self, org_unit_tuple: OrgUnitRow): + """Initialize the OrgUnitObj instance from an OrgUnitRow tuple. + + This object should represent a DHIS2 organizational unit with the same attribute naming. + + Parameters + ---------- + org_unit_tuple : tuple + A tuple containing organizational unit attributes. + """ + # Keep names consistent + self.id = org_unit_tuple.id + self.name = org_unit_tuple.name + self.shortName = org_unit_tuple.shortName + self.openingDate = org_unit_tuple.openingDate + self.closedDate = org_unit_tuple.closedDate + self.parent = org_unit_tuple.parent + # Parse geometry safely + geometry = org_unit_tuple.geometry + if pd.notna(geometry): + if isinstance(geometry, str): + try: + self.geometry = json.loads(geometry) + except json.JSONDecodeError: + self.geometry = None + else: + self.geometry = geometry + else: + self.geometry = None + + def to_json(self) -> dict: + """Return a dictionary representation of the organizational unit suitable for DHIS2 API. 
+ + Returns + ------- + dict + Dictionary containing the organizational unit's attributes formatted for DHIS2. + """ + json_dict = { + "id": self.id, + "name": self.name, + "shortName": self.shortName, + "openingDate": self.openingDate, + } + + if pd.notna(self.closedDate): + json_dict["closedDate"] = self.closedDate + + if self.parent and self.parent.get("id") and pd.notna(self.parent.get("id")): + json_dict["parent"] = {"id": self.parent.get("id")} + + if self.geometry and pd.notna(self.geometry): + json_dict["geometry"] = { + "type": self.geometry["type"], + "coordinates": self.geometry["coordinates"], + } + return json_dict + + def is_valid(self) -> bool: + """Check if the OrgUnitObj instance has all required attributes set. + + Returns + ------- + bool + True if all required attributes are not None, False otherwise. + """ + return pd.notna(self.id) and pd.notna(self.name) and pd.notna(self.shortName) and pd.notna(self.openingDate) + + def __str__(self) -> str: + return f"OrgUnitObj({self.id}, {self.name})" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, OrgUnitObj): + return NotImplemented + return ( + self.id == other.id + and self.name == other.name + and self.shortName == other.shortName + and self.openingDate == other.openingDate + and self.closedDate == other.closedDate + and self.parent == other.parent + and self.geometry == other.geometry + ) diff --git a/d2d_development/d2d_development/dataset_completion.py b/d2d_development/d2d_development/dataset_completion.py new file mode 100644 index 0000000..2c85dac --- /dev/null +++ b/d2d_development/d2d_development/dataset_completion.py @@ -0,0 +1,422 @@ +import json +import logging +from pathlib import Path + +import pandas as pd +import requests +from openhexa.sdk import current_run +from openhexa.toolbox.dhis2 import DHIS2 + + +class DHIS2ImportError(RuntimeError): + """Custom exception for DHIS2 import errors.""" + + pass + + +class DatasetCompletionSync: + """Main class to handle 
pushing data to DHIS2. + + ATTENTION: This syncer assumes the source and target DHIS2 instances + have the same organisation units configured. + """ + + def __init__( + self, + source_dhis2: DHIS2, + target_dhis2: DHIS2, + import_strategy: str = "CREATE_AND_UPDATE", + dry_run: bool = True, + logger: logging.Logger | None = None, + ): + self.source_dhis2 = source_dhis2 + self.target_dhis2 = target_dhis2 + if import_strategy not in {"CREATE", "UPDATE", "CREATE_AND_UPDATE"}: + raise ValueError("Invalid import strategy (use 'CREATE', 'UPDATE' or 'CREATE_AND_UPDATE')") + self.import_strategy = import_strategy + self.dry_run = dry_run + self.import_summary = { + "import_counts": {"imported": 0, "updated": 0, "ignored": 0, "deleted": 0}, + "errors": {"fetch_errors": 0, "no_completion": 0, "push_errors": 0}, + } + self.completion_table = pd.DataFrame() + self.logger = logger if logger else logging.getLogger(__name__) + + def _fetch_completion_status_from_source( + self, + dataset_id: str, + period: str, + org_unit: str, + children: bool = True, + timeout: int = 5, + ) -> list[dict]: + """Fetch completion status from source DHIS2. + + Args: + dataset_id: The dataset ID to fetch completion status for. + period: The period for which to fetch the completion status. + org_unit: The organisation unit to fetch completion status for. + children: Whether to include child org units in the fetch. + timeout: Timeout for the request in seconds. + + Returns: + list[dict]: A list of completion status dictionaries from the DHIS2 API. + Returns an empty list if the request fails or no data is found. 
+ """ + endpoint = f"{self.source_dhis2.api.url}/completeDataSetRegistrations" + params = { + "period": period, + "orgUnit": org_unit, + "children": "true" if children else "false", + "dataSet": dataset_id, + } + + try: + response = self.source_dhis2.api.session.get(endpoint, params=params, timeout=timeout) + response.raise_for_status() # raise exception for HTTP errors + try: + completion = response.json().get("completeDataSetRegistrations", []) + except ValueError as e: + self.import_summary["errors"]["fetch_errors"] += 1 + self.logger.error(f"Invalid JSON from {endpoint} for ds:{dataset_id} pe:{period} ou:{org_unit}: {e}") + return [] + if not completion and not children: + self.import_summary["errors"]["no_completion"] += 1 + self.logger.info( + f"No completion status found at source for ds: {dataset_id} pe: {period} ou: {org_unit}" + ) + return completion if completion else [] + except requests.RequestException as e: + self.import_summary["errors"]["fetch_errors"] += 1 + self.logger.error( + f"GET request to {self.source_dhis2.api.url} failed to retrieve completion status for " + f"ds: {dataset_id} pe: {period} ou: {org_unit} failed : {e}" + ) + return [] + + def _push_completion_status_to_target( + self, + dataset_id: str, + period: str, + org_unit: str, + date: str, + completed: bool, + timeout: int = 5, + ) -> None: + """Perform a PUT request (or POST with importStrategy) to a DHIS2 API endpoint. + + Args: + dataset_id: The dataset ID to push completion status for. + period: The period for which to push the completion status. + org_unit: The organisation unit to push completion status for. + date: The date of completion. + completed: Whether the dataset is marked as completed. + timeout: Timeout for the request in seconds. + + Raises: + requests.HTTPError if the request fails after retries. 
+ """ + endpoint = f"{self.target_dhis2.api.url}/completeDataSetRegistrations" + payload = { + "completeDataSetRegistrations": [ + { + "organisationUnit": org_unit, + "period": period, + "completed": completed, + "date": date, + "dataSet": dataset_id, + } + ] + } + params = { + "dryRun": str(self.dry_run).lower(), + "importStrategy": self.import_strategy, + "preheatCache": True, + "skipAudit": True, + "reportMode": "FULL", + } + + response = None + try: + response = self.target_dhis2.api.session.post(endpoint, json=payload, params=params, timeout=timeout) + response.raise_for_status() + except requests.RequestException: + # avoid doube counting errors in summary + # self.import_summary["errors"]["push_errors"] += 1 + raise + finally: + self._process_response(ds=dataset_id, pe=period, ou=org_unit, response=response) + + def _try_build_source_completion_table(self, org_units: list[str], dataset_id: str, period: str) -> None: + """Build a completion status table for all organisation units provided. + + Args: + org_units: List of organisation unit IDs to fetch completion status for (NOTE: use OU parents). + dataset_id: The dataset ID to fetch completion status for. + period: The period for which to fetch the completion status. + """ + if not org_units: + return + + completion_statuses = [] + for ou in org_units: + completion = self._fetch_completion_status_from_source( + dataset_id=dataset_id, period=period, org_unit=ou, children=True, timeout=30 + ) + if completion: + completion_statuses.extend(completion) + + self.completion_table = pd.DataFrame(completion_statuses) + + def _get_source_completion_status_for_ou(self, dataset_id: str, period: str, org_unit: str) -> dict | None: + """Handle fetching completion status for a specific org unit. + + Returns: + list: The completion status as dictionaries for the specified org unit (children) if found, otherwise []. 
def sync(
    self,
    source_dataset_id: str,
    target_dataset_id: str,
    org_units: list[str] | None,
    parent_ou: list[str] | None,
    period: str,
    logging_interval: int = 2000,
    ds_processed_path: Path | None = None,
    mark_uncompleted_as_processed: bool = False,
) -> None:
    """Sync completion status between datasets.

    Errors raised while pushing a single org unit are logged and do not stop the
    loop. Any other error raised inside the loop is logged and ends the loop; it is
    not re-raised. The summary is logged once the loop has started, whatever happens.

    Args:
        source_dataset_id: The dataset ID in the source DHIS2 instance.
        target_dataset_id: The dataset ID in the target DHIS2 instance.
        org_units: List of organisation unit IDs to sync.
        parent_ou: List of parent organisation unit IDs to build completion table (if None, no table built).
        period: The period for which to sync the completion status.
        logging_interval: Interval for logging progress and saving processed org units
            (defaults to 2000). A value <= 0 disables intermediate checkpoints; the
            final one is always written.
        ds_processed_path: Path to save processed org units (if None, no file saving nor comparison).
        mark_uncompleted_as_processed: If True, org units with no completion status will be marked as processed.
    """
    self.reset_import_summary()

    if not org_units:
        msg = f"No org units provided for period {period}. DS sync skipped."
        self.logger.warning(msg)
        current_run.log_warning(msg)
        return

    org_units_to_process = self._get_unprocessed_org_units(org_units, ds_processed_path, period)
    if not org_units_to_process:
        msg = f"All org units already processed for period {period}. DS sync skipped."
        self.logger.info(msg)
        current_run.log_info(msg)
        return

    msg = (
        f"Starting dataset '{target_dataset_id}' completion process for period: "
        f"{period} org units: {len(org_units_to_process)}."
    )
    current_run.log_info(msg)
    self.logger.info(msg)

    self._try_build_source_completion_table(org_units=parent_ou, dataset_id=source_dataset_id, period=period)

    try:
        processed = []
        for idx, ou in enumerate(org_units_to_process, start=1):
            if self._sync_org_unit(
                source_dataset_id=source_dataset_id,
                target_dataset_id=target_dataset_id,
                period=period,
                org_unit=ou,
                mark_uncompleted_as_processed=mark_uncompleted_as_processed,
            ):
                processed.append(ou)

            # The checkpoint must run for every org unit, including skipped ones;
            # otherwise a skipped last org unit would leave the processed list unsaved.
            at_interval = logging_interval > 0 and idx % logging_interval == 0
            if at_interval or idx == len(org_units_to_process):
                current_run.log_info(f"{idx} / {len(org_units_to_process)} OUs processed")
                self._update_processed_ds_sync_file(
                    processed=processed,
                    period=period,
                    processed_path=ds_processed_path,
                )
    except Exception as e:
        self.logger.error(f"Error setting dataset completion for dataset {target_dataset_id}, period {period}: {e}")
    finally:
        self._log_summary(org_units=org_units_to_process, period=period)

def _sync_org_unit(
    self,
    source_dataset_id: str,
    target_dataset_id: str,
    period: str,
    org_unit: str,
    mark_uncompleted_as_processed: bool,
) -> bool:
    """Copy the completion status of one org unit; return True if it counts as processed."""
    completion_status = self._get_source_completion_status_for_ou(
        dataset_id=source_dataset_id,
        period=period,
        org_unit=org_unit,
    )

    if not completion_status:
        # if True, empty completion -> mark as processed
        return mark_uncompleted_as_processed

    if "date" not in completion_status or "completed" not in completion_status:
        self.import_summary["errors"]["push_errors"] += 1
        self.logger.error(
            f"Missing keys in completion status for period {period}, org unit {org_unit}: {completion_status}"
        )
        return False

    try:
        self._push_completion_status_to_target(
            dataset_id=target_dataset_id,
            period=period,
            org_unit=org_unit,
            date=completion_status["date"],
            completed=completion_status["completed"],
        )
    except Exception as e:
        self.logger.error(f"Error pushing completion status for period {period}, org unit {org_unit}: {e}")
        return False
    return True
ds_processed_fname = processed_path / f"ds_ou_processed_{period}.parquet" + if not ds_processed_fname.exists(): + return org_units + + try: + processed_df = pd.read_parquet(ds_processed_fname) + if "ORG_UNIT" not in processed_df.columns: + raise KeyError("Missing ORG_UNIT column") + + processed_set = set(processed_df["ORG_UNIT"].dropna().unique()) + remaining = [ou for ou in org_units if ou not in processed_set] + + msg = f"Loaded {len(processed_set)} processed org units, {len(remaining)} to process for period {period}." + self.logger.info(msg) + current_run.log_info(msg) + return remaining + except Exception as e: + msg = f"Error loading processed org units file: {ds_processed_fname}. Returning all org units to process." + self.logger.error(msg + f" Error: {e}") + current_run.log_info(msg) + return org_units + + def _update_processed_ds_sync_file( + self, + processed: list, + period: str, + processed_path: Path | None, + ) -> None: + """Save the processed org units to a parquet file.""" + if processed_path is None: + current_run.log_warning("No processed path provided, skipping saving processed org units.") + return + + processed_path.mkdir(parents=True, exist_ok=True) + ds_processed_file = processed_path / f"ds_ou_processed_{period}.parquet" + + msg = None + final_processed = processed + + if ds_processed_file.exists(): + existing_df = pd.read_parquet(ds_processed_file) + existing_org_units = set(existing_df["ORG_UNIT"].unique()) + new_org_units = [ou for ou in processed if ou not in existing_org_units] + final_processed = list(existing_org_units) + new_org_units + msg = ( + f"Found {len(existing_org_units)} processed OUs, " + f"updating file {ds_processed_file.name} with {len(new_org_units)} new OUs." + ) + + if final_processed: + df_processed = pd.DataFrame({"ORG_UNIT": final_processed}) + df_processed.to_parquet(ds_processed_file, index=False) + msg = f"Saved {len(final_processed)} processed org units in {ds_processed_file.name}." 
+ + if msg: + current_run.log_info(msg) + self.logger.info(msg) + + def _log_summary(self, org_units: list, period: str) -> None: + msg = ( + f"Dataset completion period {period} summary: {self.import_summary['import_counts']} " + f"total org units: {len(org_units)} " + ) + current_run.log_info(msg) + self.logger.info(msg) + + if self.import_summary["errors"]["no_completion"] > 0: + msg = ( + f"{self.import_summary['errors']['no_completion']} out of " + f"{len(org_units)} completion statuses failed to be retrieved from source." + ) + current_run.log_warning(msg) + self.logger.warning(msg) + + if self.import_summary["errors"]["fetch_errors"] > 0: + msg = ( + f"{self.import_summary['errors']['fetch_errors']} out of " + f"{len(org_units)} completion statuses failed to fetch." + ) + current_run.log_warning(msg) + self.logger.warning(msg) + + if self.import_summary["errors"]["push_errors"] > 0: + msg = ( + f"{self.import_summary['errors']['push_errors']} " + f"out of {len(org_units)} completion statuses failed to push." + ) + current_run.log_warning(msg) + self.logger.warning(msg) + + def _process_response(self, ds: str, pe: str, ou: str, response: dict) -> None: + """Log the response from the DHIS2 API after pushing completion status.""" + json_or_none = self._safe_json(response) + if not json_or_none: + self.import_summary["errors"]["push_errors"] += 1 + self.logger.error( + f"No JSON response received for completion request ds: {ds} pe: {pe} ou: {ou} from DHIS2 API." 
+ ) + raise DHIS2ImportError("Empty or invalid JSON response from DHIS2") + + conflicts: list[str] = json_or_none.get("conflicts", {}) + status = json_or_none.get("status") + if status in ["ERROR", "WARNING"] or conflicts: + for conflict in conflicts: + self.import_summary["errors"]["push_errors"] += 1 + self.logger.error( + f"Conflict pushing completion for ds: {ds} pe: {pe} ou: {ou} status: {status} - {conflict}" + ) + self._update_import_summary(response=json_or_none) + raise DHIS2ImportError( + f"DHIS2 completion push failed with status={status} " + f"and {len(conflicts)} conflict(s) for ds:{ds} pe:{pe} ou:{ou}" + ) + + if status == "SUCCESS": + self.logger.info(f"Successfully pushed to target completion ds: {ds} pe:{pe} ou: {ou}") + self._update_import_summary(response=json_or_none) + + def _safe_json(self, r: requests.Response) -> dict | None: + if r is None: + return None + try: + return r.json() + except (ValueError, json.JSONDecodeError): + return None + + def _update_import_summary(self, response: dict) -> None: + if response: + import_counts = response.get("importCount", {}) + for key in ["imported", "updated", "ignored", "deleted"]: + self.import_summary["import_counts"][key] += import_counts.get(key, 0) + + def reset_import_summary(self) -> None: + """Reset the import summary to its initial state.""" + self.import_summary = { + "import_counts": {"imported": 0, "updated": 0, "ignored": 0, "deleted": 0}, + "errors": {"fetch_errors": 0, "no_completion": 0, "push_errors": 0}, + } diff --git a/d2d_development/d2d_development/exceptions.py b/d2d_development/d2d_development/exceptions.py new file mode 100644 index 0000000..28096b9 --- /dev/null +++ b/d2d_development/d2d_development/exceptions.py @@ -0,0 +1,6 @@ +class ExtractorError(Exception): + """Custom exception for all DHIS2Extractor errors.""" + + +class PusherError(Exception): + """Custom exception for all DHIS2Pusher errors.""" diff --git a/d2d_development/d2d_development/extract.py 
class DataElementsExtractor:
    """Handles downloading and formatting of data elements from DHIS2."""

    def __init__(self, extractor: "DHIS2Extractor"):
        # The parent extractor supplies the DHIS2 client, logging and file handling.
        self.extractor = extractor

    def download_period(
        self,
        data_elements: list[str],
        org_units: list[str],
        period: str,
        output_dir: Path,
        filename: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Path | None:
        """Extract data element values for one period through the parent extractor.

        Parameters
        ----------
        data_elements : list[str]
            List of DHIS2 data element UIDs to extract.
        org_units : list[str]
            List of DHIS2 organization unit UIDs to extract data for.
        period : str
            DHIS2 period (valid format) to extract data for.
        output_dir : Path
            Directory where extracted data files will be saved.
        filename : str | None
            Optional filename for the extracted data file. If None, a default name will be used.
        kwargs : dict
            Additional keyword arguments for data retrieval, such as `last_updated` for filtering data.

        Returns
        -------
        Path | None
            Whatever the parent extractor's ``_handle_extract_for_period`` returns:
            the path to the extracted data file, or None if no data was extracted.

        Raises
        ------
        ExtractorError
            If any error occurs during the extract process (the original error is chained).
        """
        parent = self.extractor
        try:
            parent._log_message(f"Retrieving data elements extract for period : {period}")
            extract_path = parent._handle_extract_for_period(
                handler=self,
                data_products=data_elements,
                org_units=org_units,
                period=period,
                output_dir=output_dir,
                filename=filename,
                **kwargs,
            )
        except Exception as e:
            self.extractor._log_message(
                "Extract data elements download error.", log_current_run=False, error_details=str(e), level="error"
            )
            raise ExtractorError(f"Extract data elements download error : {e}") from e
        return extract_path

    def _retrieve_data(self, data_elements: list[str], org_units: list[str], period: str, **kwargs) -> pl.DataFrame:  # noqa: ANN003
        """Fetch data values from the client's ``data_value_sets`` endpoint and map them to the extract format."""
        parent = self.extractor
        if not parent._valid_dhis2_period_format(period):
            raise ExtractorError(f"Invalid DHIS2 period format: {period}")

        request = {
            "data_elements": data_elements,
            "periods": [period],
            "org_units": org_units,
            "last_updated": kwargs.get("last_updated"),  # not implemented yet
        }
        try:
            values = parent.dhis2_client.data_value_sets.get(**request)
        except Exception as e:
            msg = "Error retrieving data elements data"
            self.extractor._log_message(msg, log_current_run=False, error_details=str(e), level="error")
            raise ExtractorError(msg) from e

        frame = pl.DataFrame(values)
        return parent._map_to_dhis2_format(frame, data_type=DataType.DATA_ELEMENT)
class IndicatorsExtractor:
    """Handles downloading and formatting of indicators from DHIS2."""

    def __init__(self, extractor: "DHIS2Extractor"):
        # The parent extractor supplies the DHIS2 client, logging and file handling.
        self.extractor = extractor

    def download_period(
        self,
        indicators: list[str],
        org_units: list[str],
        period: str,
        output_dir: Path,
        filename: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Path | None:
        """Extract indicator values for one period through the parent extractor.

        Parameters
        ----------
        indicators : list[str]
            List of DHIS2 indicators UIDs to extract.
        org_units : list[str]
            List of DHIS2 organization unit UIDs to extract data for.
        period : str
            DHIS2 period (valid format) to extract data for.
        output_dir : Path
            Directory where extracted data files will be saved.
        filename : str | None
            Optional filename for the extracted data file. If None, a default name will be used.
        kwargs : dict
            Additional keyword arguments for data retrieval from analytics like:
            -include_cocs: bool, whether to include category option combo mapping for indicators.
            -last_updated: datetime, not implemented yet, placeholder for future use to filter data
                based on last updated timestamp.

        Returns
        -------
        Path | None
            Whatever the parent extractor's ``_handle_extract_for_period`` returns:
            the path to the extracted data file, or None if no data was extracted.

        Raises
        ------
        ExtractorError
            If any error occurs during the extract process (the original error is chained).
        """
        parent = self.extractor
        try:
            parent._log_message(f"Retrieving indicators extract for period : {period}")
            extract_path = parent._handle_extract_for_period(
                handler=self,
                data_products=indicators,
                org_units=org_units,
                period=period,
                output_dir=output_dir,
                filename=filename,
                **kwargs,
            )
        except Exception as e:
            self.extractor._log_message(
                "Extract indicators download error.", log_current_run=False, error_details=str(e), level="error"
            )
            raise ExtractorError(f"Extract indicators download error : {e}") from e
        return extract_path

    def _retrieve_data(self, indicators: list[str], org_units: list[str], period: str, **kwargs) -> pl.DataFrame:  # noqa: ANN003
        """Fetch indicator values from the client's ``analytics`` endpoint and map them to the extract format."""
        parent = self.extractor
        if not parent._valid_dhis2_period_format(period):
            raise ExtractorError(f"Invalid DHIS2 period format: {period}")

        # NOTE: This option is useful to retrieve data elements using the analytics endpoint.
        include_cocs = kwargs.get("include_cocs", False)
        try:
            rows = parent.dhis2_client.analytics.get(
                indicators=indicators,
                periods=[period],
                org_units=org_units,
                include_cocs=include_cocs,
            )
        except Exception as e:
            msg = "Error retrieving indicators data"
            self.extractor._log_message(msg, log_current_run=False, error_details=str(e), level="error")
            raise ExtractorError(msg) from e

        # NOTE(review): the rename assumes "pe" and "ou" columns exist; behaviour on an
        # empty analytics response is not visible from here - confirm.
        frame = pl.DataFrame(rows).rename({"pe": "period", "ou": "orgUnit"})
        if "co" in frame.columns:
            frame = frame.rename({"co": "categoryOptionCombo"})
        return parent._map_to_dhis2_format(frame, data_type=DataType.INDICATOR, map_cocs=include_cocs)
class ReportingRatesExtractor:
    """Handles downloading and formatting of reporting rates from DHIS2."""

    def __init__(self, extractor: "DHIS2Extractor"):
        # The parent extractor supplies the DHIS2 client, logging and file handling.
        self.extractor = extractor

    def download_period(
        self,
        reporting_rates: list[str],
        org_units: list[str],
        period: str,
        output_dir: Path,
        filename: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Path | None:
        """Extract reporting rate values for one period through the parent extractor.

        Parameters
        ----------
        reporting_rates : list[str]
            List of DHIS2 reporting rate identifiers to extract
            (presumably written as ``UID.RATE`` - confirm against callers).
        org_units : list[str]
            List of DHIS2 organization unit UIDs to extract data for.
        period : str
            DHIS2 period (valid format) to extract data for.
        output_dir : Path
            Directory where extracted data files will be saved.
        filename : str | None
            Optional filename for the extracted data file. If None, a default name will be used.
        kwargs : dict
            Additional keyword arguments for data retrieval, such as `last_updated` for filtering data.

        Returns
        -------
        Path | None
            Whatever the parent extractor's ``_handle_extract_for_period`` returns:
            the path to the extracted data file, or None if no data was extracted.

        Raises
        ------
        ExtractorError
            If any error occurs during the extract process (the original error is chained).
        """
        parent = self.extractor
        try:
            parent._log_message(f"Retrieving reporting rates extract for period : {period}")
            extract_path = parent._handle_extract_for_period(
                handler=self,
                data_products=reporting_rates,
                org_units=org_units,
                period=period,
                output_dir=output_dir,
                filename=filename,
                **kwargs,
            )
        except Exception as e:
            self.extractor._log_message(
                "Extract reporting rates download error.", log_current_run=False, error_details=str(e), level="error"
            )
            raise ExtractorError(f"Extract reporting rates download error : {e}") from e
        return extract_path

    def _retrieve_data(self, reporting_rates: list[str], org_units: list[str], period: str, **kwargs) -> pl.DataFrame:  # noqa: ANN003
        """Fetch reporting rates from the client's ``analytics`` endpoint and map them to the extract format."""
        parent = self.extractor
        if not parent._valid_dhis2_period_format(period):
            raise ExtractorError(f"Invalid DHIS2 period format: {period}")

        # Reporting rates are passed through the client's ``data_elements`` argument.
        request = {
            "data_elements": reporting_rates,
            "periods": [period],
            "org_units": org_units,
            "include_cocs": False,  # avoid client error
        }
        try:
            rows = parent.dhis2_client.analytics.get(**request)
        except Exception as e:
            msg = "Error retrieving reporting rates data"
            self.extractor._log_message(msg, log_current_run=False, error_details=str(e), level="error")
            raise ExtractorError(msg) from e

        frame = pl.DataFrame(rows).rename({"pe": "period", "ou": "orgUnit"})
        return parent._map_to_dhis2_format(frame, data_type=DataType.REPORTING_RATE)
def __init__(
    self,
    dhis2_client: DHIS2,
    download_mode: str = "DOWNLOAD_REPLACE",
    return_existing_file: bool = False,
    logger: logging.Logger | None = None,
):
    """Configure the extractor and attach one handler per supported data type.

    Parameters
    ----------
    dhis2_client : DHIS2
        Client used by the handlers to query DHIS2.
    download_mode : str
        Either "DOWNLOAD_REPLACE" or "DOWNLOAD_NEW".
    return_existing_file : bool
        Stored on the instance; used when an extract file already exists in "DOWNLOAD_NEW" mode.
    logger : logging.Logger | None
        Logger to use; a module-level logger is used when omitted.

    Raises
    ------
    ExtractorError
        If ``download_mode`` is not one of the supported values.
    """
    # Reject an unsupported mode before any state is stored
    if download_mode not in {"DOWNLOAD_REPLACE", "DOWNLOAD_NEW"}:
        raise ExtractorError("Invalid 'download_mode', use 'DOWNLOAD_REPLACE' or 'DOWNLOAD_NEW'.")

    # Settings
    self.dhis2_client = dhis2_client
    self.download_mode = download_mode
    self.return_existing_file = return_existing_file
    self.last_updated = None  # NOTE: Placeholder for future use

    # Logging
    self.logger = logger or logging.getLogger(__name__)
    self.log_function = log_message

    # Handlers, one per data type
    self.data_elements = DataElementsExtractor(self)
    self.indicators = IndicatorsExtractor(self)
    self.reporting_rates = ReportingRatesExtractor(self)
def _map_to_dhis2_format(
    self,
    dhis_data: pl.DataFrame,
    data_type: DataType = DataType.DATA_ELEMENT,
    domain_type: str = "AGGREGATED",
    map_cocs: bool = False,
) -> pl.DataFrame | None:
    """Map raw DHIS2 data to the standardized data extraction table.

    Parameters
    ----------
    dhis_data : pl.DataFrame
        Input data. Columns that are read when present: `period`, `orgUnit`, `value`, plus
        `dataElement`, `categoryOptionCombo`, `attributeOptionCombo` (DATA_ELEMENT) or
        `dx` (INDICATOR, REPORTING_RATE). Missing columns produce null output columns.
    data_type : DataType
        The type of data being mapped:
        - DataType.DATA_ELEMENT: `dx` is taken from `dataElement`; option combos are kept.
        - DataType.INDICATOR: `dx` is taken from `dx`.
        - DataType.REPORTING_RATE: `dx` is split on the first "." into `dx` and `rateMetric`.
        Default is DataType.DATA_ELEMENT.
    domain_type : str, optional
        Value written to the `domainType` column ("AGGREGATED" by default).
        **NOTE: THIS IS WORK IN PROGRESS AND NOT USED YET**
    map_cocs : bool, optional
        Only applies to DataType.INDICATOR: when True and a `categoryOptionCombo` column is
        present, it is copied to the output. Default is False.

    Returns
    -------
    pl.DataFrame | None
        None when `dhis_data` has no rows, otherwise a DataFrame with the columns
        `dataType`, `dx`, `period`, `orgUnit`, `categoryOptionCombo`,
        `attributeOptionCombo`, `rateMetric`, `domainType` and `value`.

    Raises
    ------
    ExtractorError
        If `data_type` is not a `DataType` member or the mapping fails.
    """
    if dhis_data.height == 0:
        return None

    # isinstance is used instead of `in DataType`: enum containment with a non-member
    # raises TypeError on Python < 3.12 and accepts plain value strings on 3.12+.
    if not isinstance(data_type, DataType):
        raise ExtractorError(
            "Incorrect 'data_type' configuration use: "
            "(DataType.DATA_ELEMENT, DataType.REPORTING_RATE, DataType.INDICATOR)."
        )

    def column_or_none(name: str) -> pl.Series | None:
        """Return the named input column, or None when it is absent."""
        return dhis_data[name] if name in dhis_data.columns else None

    try:
        n = dhis_data.height
        # Key order defines the column order of the output table
        data = {
            "dataType": [data_type.value] * n,
            "dx": None,
            "period": column_or_none("period"),
            "orgUnit": column_or_none("orgUnit"),
            "categoryOptionCombo": None,
            "attributeOptionCombo": None,
            "rateMetric": None,
            "domainType": [domain_type] * n,
            "value": column_or_none("value"),
        }
        if data_type == DataType.DATA_ELEMENT:
            data["dx"] = column_or_none("dataElement")
            data["categoryOptionCombo"] = column_or_none("categoryOptionCombo")
            data["attributeOptionCombo"] = column_or_none("attributeOptionCombo")
        elif data_type == DataType.REPORTING_RATE:
            if "dx" in dhis_data.columns:
                # "<uid>.<metric>" -> uid in field_0, metric in field_1
                split = dhis_data["dx"].str.split_exact(".", 1)
                data["dx"] = split.struct.field("field_0")
                data["rateMetric"] = split.struct.field("field_1")
        elif data_type == DataType.INDICATOR:
            data["dx"] = column_or_none("dx")
            if map_cocs and "categoryOptionCombo" in dhis_data.columns:
                data["categoryOptionCombo"] = dhis_data["categoryOptionCombo"]
        return pl.DataFrame(data)

    except AttributeError as e:
        msg = (
            f"Failed to map DHIS2 data to the expected format. "
            f"Input columns: {list(dhis_data.columns)}. "
            f"Expected columns depend on data_type: {data_type}."
        )
        self._log_message(msg, log_current_run=False, error_details=f"AttributeError: {e}", level="error")
        raise ExtractorError(msg) from e
    except Exception as e:
        msg = "Unexpected error while mapping DHIS2 data"
        self._log_message(msg, log_current_run=False, error_details=f"{type(e).__name__}: {e}", level="error")
        raise ExtractorError(msg) from e
+ + This class is stateless and provides methods to synchronize organisation units + from a source DHIS2 instance to a target DHIS2 instance. The alignment process + compares the pyramids of both instances and performs the necessary operations + to keep the target up to date with the source. + + Supported operations include: + - Creating organisation units that exist in the source but not in the target. + - Updating organisation units that exist in both but differ in their attributes. + + This class does not store any state between calls; all data must be provided + as method parameters. + """ + + def __init__(self, logger: logging.Logger): + self.logger = logger if logger else logging.getLogger(__name__) + self._initialize_summary() + + def _initialize_summary(self): + self.summary = { + "CREATE": {"CREATE_COUNT": 0, "CREATE_DETAILS": [], "ERROR_COUNT": 0, "ERROR_DETAILS": []}, + "UPDATE": {"UPDATE_COUNT": 0, "UPDATE_DETAILS": [], "ERROR_COUNT": 0, "ERROR_DETAILS": []}, + "INVALID": {"INVALID_COUNT": 0, "INVALID_DETAILS": []}, + } + + def align_to( + self, + target_dhis2: DHIS2, + source_pyramid: pd.DataFrame, + dry_run: bool = True, + ): + """Syncs the extracted pyramid data with the target DHIS2 instance.""" + # Load the target pyramid + if source_pyramid.empty: + self._log_message("Source pyramid is empty. 
def _push_org_units_create(self, ou_to_create: pd.DataFrame, target_dhis2: DHIS2, dry_run: bool) -> None:
    """Create organisation units in the target DHIS2 instance.

    Each row is wrapped in an ``OrgUnitObj``. Valid units are pushed with the
    "CREATE" strategy; the outcome of every row is recorded in ``self.summary``
    under "CREATE" (success or error) or "INVALID" (failed validation).

    Parameters
    ----------
    ou_to_create : pd.DataFrame
        DataFrame containing organisation unit data to be created. Not modified.
    target_dhis2 : DHIS2
        DHIS2 client for the target instance.
    dry_run : bool
        If True, performs a dry run without making changes.

    Raises
    ------
    OrgUnitCreateError
        If an unexpected error occurs while creating organisation units.
    """
    if len(ou_to_create) == 0:
        self._log_message("No new organisation units to create.")
        return

    try:
        # NOTE: Geometry is valid for versions > 2.32
        if version.parse(target_dhis2.version) <= version.parse("2.32"):
            # assign() returns a new frame: the input is usually a filtered slice of the
            # caller's pyramid, and setting a column on it in place is a chained assignment.
            ou_to_create = ou_to_create.assign(geometry=None)
            self._log_message(
                "DHIS2 version not compatible with geometry. Geometry will not be pushed.", level="warning"
            )

        self._log_message(f"Creating {len(ou_to_create)} organisation units.")
        for row_tuple in ou_to_create.itertuples(index=False, name="OrgUnitRow"):
            ou = OrgUnitObj(row_tuple)

            if not ou.is_valid():
                invalid_ou = {"ACTION": "CREATE", "STATUS": "INVALID", "OU": str(ou.to_json())}
                self.summary["INVALID"]["INVALID_COUNT"] += 1
                self.summary["INVALID"]["INVALID_DETAILS"].append(invalid_ou)
                self.logger.warning(invalid_ou)
                continue

            response = self._push_org_unit(
                dhis2_client=target_dhis2,
                org_unit=ou,
                strategy="CREATE",
                dry_run=dry_run,  # dry_run=False -> Apply changes in the DHIS2
            )
            if response.get("status") not in ("SUCCESS", "OK"):
                self.summary["CREATE"]["ERROR_COUNT"] += 1
                self.summary["CREATE"]["ERROR_DETAILS"].append(response)
                self.logger.error(str(response))
            else:
                created_ou = {"ACTION": "CREATE", "OU": str(ou.to_json()), "RESPONSE": response}
                self.summary["CREATE"]["CREATE_COUNT"] += 1
                self.summary["CREATE"]["CREATE_DETAILS"].append(created_ou)
                self.logger.info(created_ou)

    except Exception as e:
        msg = "Unexpected error occurred while creating organisation units."
        self.logger.exception(msg)
        raise OrgUnitCreateError(f"{msg} Check logs for details.") from e
def _push_org_unit(
    self,
    dhis2_client: DHIS2,
    org_unit: OrgUnitObj,
    strategy: str = "CREATE",
    dry_run: bool = True,
    is_testing: bool = False,
) -> dict:
    """Push an organisation unit to the DHIS2 instance using the specified strategy.

    Parameters
    ----------
    dhis2_client : DHIS2
        DHIS2 client for the target instance.
    org_unit : OrgUnitObj
        Organisation unit object to be pushed.
    strategy : str, optional
        Strategy for pushing ('CREATE' or 'UPDATE'), by default "CREATE".
    dry_run : bool, optional
        If True, performs a dry run without making changes, by default True.
    is_testing : bool, optional
        If True, no request is sent and a canned successful response is used, by default False.

    Returns
    -------
    dict
        Formatted response, as built by ``_build_formatted_response``.

    Raises
    ------
    ValueError
        If ``strategy`` is neither 'CREATE' nor 'UPDATE' (only checked when a request is sent).
    """
    if is_testing:
        # Build a fake successful HTTP response so the formatting path is still exercised
        payload = {"status": "OK", "response": {"importCount": {"imported": 1, "ignored": 0}}}
        r = Response()
        r.status_code = 200
        r.headers = CaseInsensitiveDict({"Content-Type": "application/json"})
        r._content = json.dumps(payload).encode("utf-8")  # private attr used internally
    else:
        if strategy == "CREATE":
            endpoint = "organisationUnits"
            payload = org_unit.to_json()
        elif strategy == "UPDATE":
            endpoint = "metadata"
            payload = {"organisationUnits": [org_unit.to_json()]}
        else:
            # Without this branch, endpoint/payload would be unbound below
            raise ValueError(f"Unsupported strategy: {strategy} (use 'CREATE' or 'UPDATE')")

        r = dhis2_client.api.session.post(
            f"{dhis2_client.api.url}/{endpoint}",
            json=payload,
            params={"dryRun": dry_run, "importStrategy": strategy},
        )

    return self._build_formatted_response(response=r, strategy=strategy, ou_id=org_unit.id)
def __init__(
    self,
    dhis2_client: DHIS2,
    import_strategy: str = "CREATE_AND_UPDATE",
    dry_run: bool = True,
    max_post: int = 500,
    logging_interval: int = 50000,
    mandatory_fields: list[str] | None = None,
    logger: logging.Logger | None = None,
):
    """Configure the pusher.

    Parameters
    ----------
    dhis2_client : DHIS2
        Client for the target DHIS2 instance.
    import_strategy : str
        One of 'CREATE', 'UPDATE' or 'CREATE_AND_UPDATE'.
    dry_run : bool
        Sent to DHIS2 as the `dryRun` request parameter.
    max_post : int
        Maximum number of data points per POST request. Must be a positive integer.
    logging_interval : int
        Number of processed data points between two progress messages.
    mandatory_fields : list[str] | None
        Columns the input data must contain. Defaults to
        dx, period, orgUnit, categoryOptionCombo, attributeOptionCombo and value.
    logger : logging.Logger | None
        Logger to use; a module-level logger is used when omitted.

    Raises
    ------
    PusherError
        If `import_strategy` or `max_post` is invalid.
    """
    self.dhis2_client = dhis2_client

    if import_strategy not in {"CREATE", "UPDATE", "CREATE_AND_UPDATE"}:
        raise PusherError("Invalid import strategy (use 'CREATE', 'UPDATE' or 'CREATE_AND_UPDATE')")

    # Fail fast: a negative chunk size makes the chunking loop yield nothing (no data
    # would be pushed, silently) and zero makes range() raise in the middle of a push.
    if not isinstance(max_post, int) or max_post <= 0:
        raise PusherError("Invalid 'max_post', use a positive integer.")

    if mandatory_fields is None:
        self.mandatory_fields = ["dx", "period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"]
    else:
        self.mandatory_fields = mandatory_fields

    self.import_strategy = import_strategy
    self.dry_run = dry_run
    self.max_post = max_post
    self.logging_interval = logging_interval
    self.summary = {}
    self._reset_summary()
    self.logger = logger if logger else logging.getLogger(__name__)
    self.log_function = log_message
def _push_to_delete(self, data_points_to_delete: pl.DataFrame) -> None:
    """Push the data points whose value is null, after logging each of them.

    Does nothing when the DataFrame has no rows.
    """
    if data_points_to_delete.height == 0:
        return

    count = len(data_points_to_delete)
    self._log_message(f"Pushing {count} data points with NA values.")

    # Log every NA data point before sending them
    self._log_ignored_or_na(data_points_to_delete, is_na=True)

    serialized = self._serialize_data_points(data_points_to_delete)
    self._push_data_points(data_point_list=serialized)

    counts = self.summary["import_counts"]
    self._log_message(f"Data points delete summary: {counts}")
def _push_data_points(
    self,
    data_point_list: list[dict],
) -> None:
    """Post the data points to DHIS2 in chunks of ``self.max_post`` and update the summary.

    For each chunk the import counts and any conflicts found in the JSON response are
    added to ``self.summary``. A request error is recorded in ``self.summary["ERRORS"]``
    when the response carries no JSON body; ``_raise_server_errors`` is consulted first and
    may abort the push. Progress is logged every ``self.logging_interval`` data points,
    followed by a final summary message.
    """
    total = len(data_point_list)
    done = 0
    reported_at = 0
    batches = self._split_list(data_point_list, self.max_post)

    for batch_no, batch in enumerate(batches, start=1):
        http_response = None
        body = None
        try:
            http_response = self._post(batch)
            http_response.raise_for_status()
            body = self._safe_json(http_response)

            if body:
                self._update_import_counts(body)

            # Capture conflicts/errorReports if present
            self._extract_conflicts(body)

        except requests.exceptions.RequestException as exc:
            self._raise_server_errors(http_response)  # Stop the process if there's a server error
            body = self._safe_json(http_response)
            if body:
                self._update_import_counts(body)
            else:
                # No response JSON, at least log the request error msg
                self.summary["ERRORS"].append(
                    {"chunk": batch_no, "period": batch[0].get("period", "-"), "exception": str(exc)}
                )
            self._extract_conflicts(body)

        done += len(batch)

        # Only report once at least logging_interval points were handled since the last report
        if done - reported_at < self.logging_interval:
            continue

        pct = (done / total) * 100
        self._log_message(
            f"{done} / {total} data points ({pct:.1f}%) "
            f" summary: {self.summary['import_counts']}"
        )
        reported_at = done

    # Final summary
    self._log_message(
        f"{done} / {total} data points processed."
        f" Final summary: {self.summary['import_counts']}"
    )
def _extract_conflicts(self, response: dict) -> None:
    """Collect conflicts and errorReports from a DHIS2 API response into the summary.

    Entries are read from the top level and from the nested 'response' node and are
    appended to ``self.summary["ERRORS"]`` in this order: top-level conflicts, nested
    conflicts, top-level errorReports, nested errorReports. The response itself is
    left untouched.

    Parameters
    ----------
    response : dict
        The JSON response from DHIS2 after an import. A falsy value is ignored.
    """
    if not response:
        return

    # A nested node that is null or not an object is treated as absent
    nested = response.get("response")
    if not isinstance(nested, dict):
        nested = {}

    # New lists are built so the lists owned by the response are never extended in place
    all_errors = [
        *(response.get("conflicts") or []),
        *(nested.get("conflicts") or []),
        *(response.get("errorReports") or []),
        *(nested.get("errorReports") or []),
    ]

    if all_errors:
        self.summary.setdefault("ERRORS", []).extend(all_errors)
def save_to_parquet(data: pl.DataFrame | pd.DataFrame, filename: Path) -> None:
    """Save a Pandas or Polars DataFrame to a Parquet file via a temporary file and a replace.

    The data is first written to a temporary file created in the destination directory,
    which is then moved over ``filename``. If anything fails, the temporary file is removed.

    Args:
        data (Union[pl.DataFrame, pd.DataFrame]): The DataFrame to save.
        filename (Path): The path where the Parquet file will be saved.

    Raises:
        ValueError: If data is not a Pandas or Polars DataFrame.
        Exception: If saving fails.
    """
    # Validate outside the try block: inside it, the ValueError would be caught below
    # and re-raised as a generic "Failed to save" error.
    if not isinstance(data, (pl.DataFrame, pd.DataFrame)):
        raise ValueError("The 'data' parameter must be a Pandas or Polars DataFrame.")

    temp_filename = None
    try:
        # Reserve a temporary file in the same directory as the destination
        with tempfile.NamedTemporaryFile(suffix=".parquet", dir=filename.parent, delete=False) as tmp_file:
            temp_filename = Path(tmp_file.name)

        # Use appropriate write method based on DataFrame type
        if isinstance(data, pl.DataFrame):
            data.write_parquet(temp_filename)
        else:  # pd.DataFrame
            data.to_parquet(temp_filename, index=False)

        # Replace the old file with the new one
        temp_filename.replace(filename)
        temp_filename = None  # Mark as successfully moved

    except Exception as e:
        # Clean up the temp file if it exists
        if temp_filename is not None and temp_filename.exists():
            temp_filename.unlink()
        raise Exception(f"Failed to save parquet file to {filename}") from e
b/d2d_development/pyproject.toml new file mode 100644 index 0000000..96cf936 --- /dev/null +++ b/d2d_development/pyproject.toml @@ -0,0 +1,95 @@ +[build-system] +# These lines tell pip what tools are needed to actually build your package. +# This is standard for almost all modern Python projects. +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "d2d-development" +version = "0.0.0" +description = "OpenHEXA DHIS2 to DHIS2 development utility library maintained by Bluesquare Data Services team." +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "openhexa-toolbox>=2.0.0", + "openhexa.sdk>=1.0.0", # Baseline version for safety + "requests>=2.31.0", # Baseline for modern security/features + "pandas>=2.2.0", # Use >=2.2 for compatibility + "polars>=1.0.0" +] + +[tool.setuptools] +package-dir = {"" = ""} + +[tool.setuptools.packages.find] +# Ensures it finds the 'd2d_development' folder in the project root +where = ["."] + +[tool.ruff] +line-length = 120 + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 120 + +[tool.ruff.lint] +preview = true +select = [ + "F", # Pyflakes + "E", # pycodestyle + "I", # isort + "D", # pydocstyle + "UP", # pyupgrade + "ANN", # flake8-annotations + "B", # bugbear + "A", # flake8-builtins + "COM", # flake8-commas + "FA", # flake8-future-annotations + "PT", # flake8-pytest-style + "Q", # flake8-quotes + "RET", # flake8-return + "SIM", # flake8-simplify + "PTH", # flake8-use-pathlib + "NPY", # NumPy rules + "PD", # pandas rules + "N", # pep8-naming + "DOC", # pydoclint + "PLC", # pylint convention + "PLE", # pylint error + "PLW", # pylint warning + "RUF", # ruff specific rules +] + +ignore = [ + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + "D105", # Missing docstring in magic method + "D106", # Missing docstring in public nested class + "D107", # Missing docstring in __init__ + "D401", # First line
should be in imperative mood + "D413", # Missing blank line after last section + "D203", # 1 blank line required before class docstring + "SIM108", # Use ternary operators + "SIM102", # Use a single if statement instead of nested if statements + "SIM114", # Combine `if` branches + "DOC501", # Raised exception {id} missing from docstring + "DOC502", # Raised exception is not explicitly raised: {id} + "RUF022", # `__all__` is not sorted + "RUF005", # Consider expression instead of concatenation + "PD901", # Avoid using the generic variable name df for dataframes + "PLR0904", # Too many public methods ({methods} > {max_methods}) + "PLR0911", # Too many return statements ({returns} > {max_returns}) + "PLR0912", # Too many branches ({branches} > {max_branches}) + "PLR0913", # Too many arguments ({arguments} > {max_arguments}) + "PLR0914", # Too many local variables ({variables} > {max_variables}) + "PLR0915", # Too many statements ({statements} > {max_statements}) + "PLR0916", # Too many Boolean expressions ({expressions} > {max_expressions}) + "PLR1702", # Too many nested blocks ({blocks} > {max_blocks}), + "COM812", # Missing trailing comma +] + +[tool.ruff.lint.flake8-annotations] +allow-star-arg-any = true +mypy-init-return = true +suppress-dummy-args = true +suppress-none-returning = true diff --git a/d2d_development/tests/__init__.py b/d2d_development/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d2d_development/tests/mock_dhis2_get.py b/d2d_development/tests/mock_dhis2_get.py new file mode 100644 index 0000000..0c18c7f --- /dev/null +++ b/d2d_development/tests/mock_dhis2_get.py @@ -0,0 +1,238 @@ +class MockDataValueSets: + """Mock class to simulate DHIS2 DataValueSets API responses for testing purposes.""" + + def get(self, data_elements=None, periods=None, org_units=None, last_updated=None) -> list[dict]: # noqa: ANN001 + """Simulate the retrieval of data values from DHIS2 based on the provided parameters. 
+ + Returns + ------- + list[dict] + A list of dictionaries representing data values, formatted similarly to what the DHIS2 API would + """ + # Return a mock response for data elements + # You can customize the returned data for your tests + return [ + { + "dataElement": "AAA111", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "12", + "storedBy": "user1", + "created": "2025-01-01T10:00:00.000+0000", + "lastUpdated": "2025-01-01T10:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "BBB222", + "period": "202501", + "orgUnit": "ORG002", + "categoryOptionCombo": "CAT002", + "attributeOptionCombo": "ATTR002", + "value": "18", + "storedBy": "user2", + "created": "2025-01-02T11:00:00.000+0000", + "lastUpdated": "2025-01-02T11:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "CCC333", + "period": "202501", + "orgUnit": "ORG003", + "categoryOptionCombo": "CAT003", + "attributeOptionCombo": "ATTR003", + "value": "25", + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "DELETE1", + "period": "202501", + "orgUnit": "ORG004", + "categoryOptionCombo": "CAT004", + "attributeOptionCombo": "ATTR004", + "value": None, + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "INVALID1", + "period": None, + "orgUnit": "ORG005", + "categoryOptionCombo": "CAT005", + "attributeOptionCombo": "ATTR005", + "value": "55.0", + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "INVALID2", + "period": "202501", + "orgUnit": None, + "categoryOptionCombo": "CAT005", + 
"attributeOptionCombo": "ATTR005", + "value": "55.0", + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "INVALID3", + "period": "202501", + "orgUnit": "ORG005", + "categoryOptionCombo": None, + "attributeOptionCombo": "ATTR005", + "value": "55.0", + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": "INVALID4", + "period": "202501", + "orgUnit": "ORG005", + "categoryOptionCombo": "CAT005", + "attributeOptionCombo": None, + "value": "55.0", + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + { + "dataElement": None, + "period": "202501", + "orgUnit": "ORG006", + "categoryOptionCombo": "CAT006", + "attributeOptionCombo": "ATTR006", + "value": "55.0", + "storedBy": "user3", + "created": "2025-01-03T12:00:00.000+0000", + "lastUpdated": "2025-01-03T12:05:00.000+0000", + "comment": None, + "followup": False, + }, + ] + + +class MockAnalytics: + """Mock class to simulate DHIS2 Analytics API responses for testing purposes.""" + + def get(self, indicators=None, data_elements=None, periods=None, org_units=None, include_cocs=False) -> list[dict]: # noqa: ANN001 + """Simulate the retrieval of analytics data from DHIS2 based on the provided parameters. 
+ + Returns + ------- + list[dict] + A list of dictionaries representing analytics data, formatted similarly to what the DHIS2 API would + """ + if data_elements: + return [ + { + "dx": "AAA111.REPORTING_RATE", + "pe": "202409", + "ou": "OU001", + "value": "100", + }, + { + "dx": "BBB222.EXPECTED_REPORTS", + "pe": "202409", + "ou": "OU002", + "value": "0", + }, + { + "dx": "CCC333.REPORTING_RATE", + "pe": "202409", + "ou": "OU003", + "value": "100", + }, + ] + + if include_cocs: + return [ + { + "dx": "DATAELEMENT1", + "pe": "202501", + "ou": "ORG001", + "co": "COC001", + "value": "6.0", + }, + { + "dx": "DATAELEMENT2", + "pe": "202501", + "ou": "ORG002", + "co": "COC002", + "value": "7.0", + }, + { + "dx": "DATAELEMENT3", + "pe": "202501", + "ou": "ORG003", + "co": "COC003", + "value": "8.0", + }, + ] + return [ + { + "dx": "INDICATOR1", + "pe": "202501", + "ou": "ORG001", + "value": "5.0", + }, + { + "dx": "INDICATOR2", + "pe": "202501", + "ou": "ORG002", + "value": "7.0", + }, + { + "dx": "INDICATOR3", + "pe": "202501", + "ou": "ORG003", + "value": "9.0", + }, + ] + + +class MockSession: + """Mock class to simulate a requests.Session for testing purposes.""" + + def post(self, *args, **kwargs: object) -> None: # noqa: ANN002 + """Simulate a POST request to the DHIS2 API.""" + # This will be patched in your test + pass + + +class MockAPI: + """Mock class to simulate a DHIS2 API client for testing purposes.""" + + def __init__(self): + self.session = MockSession() + self.url = "https://mock-dhis2-instance.org/api" + + +class MockDHIS2Client: + """Mock class to simulate a DHIS2 client for testing purposes.""" + + def __init__(self): + self.data_value_sets = MockDataValueSets() + self.analytics = MockAnalytics() + self.api = MockAPI() + self.session = MockSession() diff --git a/d2d_development/tests/mock_dhis2_post.py b/d2d_development/tests/mock_dhis2_post.py new file mode 100644 index 0000000..8e937e8 --- /dev/null +++ b/d2d_development/tests/mock_dhis2_post.py @@ 
-0,0 +1,484 @@ +import requests + + +class MockDHIS2Response: + """Mock class to simulate a response from the DHIS2 API for testing purposes.""" + + def __init__(self, json_data, status_code=200): # noqa: ANN001 + self._json_data = json_data + self.status_code = status_code + + def json(self) -> dict: # noqa: D102 + return self._json_data + + def raise_for_status(self): # noqa: D102 + if not (200 <= self.status_code < 300): + raise requests.exceptions.HTTPError(f"HTTP {self.status_code}") + + +MOCK_DHIS2_ERROR_503_RESPONSE = { + "httpStatus": "Service Unavailable", + "httpStatusCode": 503, + "status": "ERROR", + "message": "Service temporarily unavailable", +} + +# Example OK response DHIS2 version: '2.40.9' +MOCK_DHIS2_OK_RESPONSE = { + "httpStatus": "OK", + "httpStatusCode": 200, + "status": "OK", + "message": "Import was successful.", + "response": { + "responseType": "ImportSummary", + "status": "SUCCESS", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": {"imported": 1, "updated": 0, "ignored": 0, "deleted": 0}, + "conflicts": [], + "rejectedIndexes": [], + 
"dataSetComplete": "false", + }, +} + +# Example 409 conflict response DHIS2 version: '2.40.9' +MOCK_DHIS2_ERROR_409_RESPONSE_DE = { + "httpStatus": "Conflict", + "httpStatusCode": 409, + "status": "WARNING", + "message": "One more conflicts encountered, please check import summary.", + "response": { + "responseType": "ImportSummary", + "status": "WARNING", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": { + "imported": 1, + "updated": 0, + "ignored": 2, + "deleted": 0, + }, + "conflicts": [ + { + "object": "INVALID_1", + "objects": {"dataElement": "INVALID_1"}, + "value": "Data element not found or not accessible: `INVALID_1`", + "errorCode": "E7610", + "property": "dataElement", + "indexes": [1], + }, + { + "object": "INVALID_2", + "objects": {"dataElement": "INVALID_2"}, + "value": "Data element not found or not accessible: `INVALID_2`", + "errorCode": "E7610", + "property": "dataElement", + "indexes": [2], + }, + ], + "rejectedIndexes": [1, 2], + "dataSetComplete": "false", + }, +} + +# Example 409 conflict response DHIS2 version: '2.40.9' 
+MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS = { + "httpStatus": "Conflict", + "httpStatusCode": 409, + "status": "WARNING", + "message": "One more conflicts encountered, please check import summary.", + "response": { + "responseType": "ImportSummary", + "status": "WARNING", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": { + "imported": 1, + "updated": 0, + "ignored": 2, + "deleted": 0, + }, + "conflicts": [ + { + "object": "INVALID_1_OU", + "objects": {"organisationUnit": "INVALID_1_OU"}, + "value": "Organisation unit not found or not accessible: `INVALID_1_OU`", + "errorCode": "E7612", + "property": "orgUnit", + "indexes": [1], + }, + { + "object": "INVALID_2_OU", + "objects": {"organisationUnit": "INVALID_2_OU"}, + "value": "Organisation unit not found or not accessible: `INVALID_2_OU`", + "errorCode": "E7612", + "property": "orgUnit", + "indexes": [2], + }, + ], + "rejectedIndexes": [1, 2], + "dataSetComplete": "false", + }, +} + +# Example 409 conflict response DHIS2 version: '2.40.9' +MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD = { + "httpStatus": "Conflict", + "httpStatusCode": 
409, + "status": "WARNING", + "message": "One more conflicts encountered, please check import summary.", + "response": { + "responseType": "ImportSummary", + "status": "WARNING", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": { + "imported": 1, + "updated": 0, + "ignored": 2, + "deleted": 0, + }, + "conflicts": [ + { + "object": "INVALID_PERIOD_1", + "objects": {"period": "INVALID_PERIOD_1"}, + "value": "Period not valid: `INVALID_PERIOD_1`", + "errorCode": "E7611", + "property": "period", + "indexes": [1], + }, + { + "object": "INVALID_PERIOD_2", + "objects": {"period": "INVALID_PERIOD_2"}, + "value": "Period not valid: `INVALID_PERIOD_2`", + "errorCode": "E7611", + "property": "period", + "indexes": [2], + }, + ], + "rejectedIndexes": [1, 2], + "dataSetComplete": "false", + }, +} + +# Example 409 conflict response DHIS2 version: '2.40.9' +MOCK_DHIS2_ERROR_409_RESPONSE_COC = { + "httpStatus": "Conflict", + "httpStatusCode": 409, + "status": "WARNING", + "message": "One more conflicts encountered, please check import summary.", + "response": { + "responseType": 
"ImportSummary", + "status": "WARNING", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": { + "imported": 1, + "updated": 0, + "ignored": 2, + "deleted": 0, + }, + "conflicts": [ + { + "object": "INVALID_COC_1", + "objects": {"categoryOptionCombo": "INVALID_COC_1"}, + "value": "Category option combo not found or not accessible for writing data: `INVALID_COC_1`", + "errorCode": "E7613", + "property": "categoryOptionCombo", + "indexes": [1], + }, + { + "object": "INVALID_COC_2", + "objects": {"categoryOptionCombo": "INVALID_COC_2"}, + "value": "Category option combo not found or not accessible for writing data: `INVALID_COC_2`", + "errorCode": "E7613", + "property": "categoryOptionCombo", + "indexes": [2], + }, + ], + "rejectedIndexes": [1, 2], + "dataSetComplete": "false", + }, +} + +# Example 409 conflict response DHIS2 version: '2.40.9' +MOCK_DHIS2_ERROR_409_RESPONSE_AOC = { + "httpStatus": "Conflict", + "httpStatusCode": 409, + "status": "WARNING", + "message": "One more conflicts encountered, please check import summary.", + "response": { + "responseType": "ImportSummary", + 
"status": "WARNING", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": { + "imported": 1, + "updated": 0, + "ignored": 2, + "deleted": 0, + }, + "conflicts": [ + { + "object": "INVALID_AOC_1", + "objects": {"categoryOptionCombo": "INVALID_AOC_1"}, + "value": "Attribute option combo not found or not accessible for writing data: `INVALID_AOC_1`", + "errorCode": "E7615", + "property": "attributeOptionCombo", + "indexes": [1], + }, + { + "object": "INVALID_AOC_2", + "objects": {"categoryOptionCombo": "INVALID_AOC_2"}, + "value": "Attribute option combo not found or not accessible for writing data: `INVALID_AOC_2`", + "errorCode": "E7615", + "property": "attributeOptionCombo", + "indexes": [2], + }, + ], + "rejectedIndexes": [1, 2], + "dataSetComplete": "false", + }, +} + +# Example 409 conflict response DHIS2 version: '2.40.9' +MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT = { + "httpStatus": "Conflict", + "httpStatusCode": 409, + "status": "WARNING", + "message": "One more conflicts encountered, please check import summary.", + "response": { + "responseType": "ImportSummary", + 
"status": "WARNING", + "importOptions": { + "idSchemes": {}, + "dryRun": True, + "preheatCache": True, + "async": False, + "importStrategy": "CREATE_AND_UPDATE", + "mergeMode": "REPLACE", + "reportMode": "FULL", + "skipExistingCheck": False, + "sharing": False, + "skipNotifications": False, + "skipAudit": True, + "datasetAllowsPeriods": False, + "strictPeriods": False, + "strictDataElements": False, + "strictCategoryOptionCombos": False, + "strictAttributeOptionCombos": False, + "strictOrganisationUnits": False, + "strictDataSetApproval": False, + "strictDataSetLocking": False, + "strictDataSetInputPeriods": False, + "requireCategoryOptionCombo": False, + "requireAttributeOptionCombo": False, + "skipPatternValidation": False, + "ignoreEmptyCollection": False, + "force": False, + "firstRowIsHeader": True, + "skipLastUpdated": False, + "mergeDataValues": False, + "skipCache": False, + }, + "description": "Import process completed successfully", + "importCount": { + "imported": 2, + "updated": 0, + "ignored": 1, + "deleted": 0, + }, + "conflicts": [ + { + "object": "VALID2", + "objects": {"dataElement": "VALID2", "value": "value_not_numeric"}, + "value": "Value must match value type of data element `VALID2`: `La valeur est non numérique`", + "errorCode": "E7619", + "property": "value", + "indexes": [1], + } + ], + "rejectedIndexes": [1], + "dataSetComplete": "false", + }, +} diff --git a/d2d_development/tests/test_data_point.py b/d2d_development/tests/test_data_point.py new file mode 100644 index 0000000..824ba02 --- /dev/null +++ b/d2d_development/tests/test_data_point.py @@ -0,0 +1,77 @@ +import polars as pl + +from d2d_development.data_models import DataPointModel +from tests.mock_dhis2_get import MockDHIS2Client + + +def test_data_point_model_to_str(): + """Test conversion of a Polars DataFrame to JSON using the DataPointModel.""" + single_point = DataPointModel( + dataElement="de1", + period="202601", + orgUnit="OU1", + categoryOptionCombo="coc1", + 
attributeOptionCombo="aoc1", + value="100.2", + ) + + assert "dataElement=de1" in str(single_point) + assert "period=202601" in str(single_point) + assert "orgUnit=OU1" in str(single_point) + assert "categoryOptionCombo=coc1" in str(single_point) + assert "attributeOptionCombo=aoc1" in str(single_point) + assert "value=100.2" in str(single_point) + + +def test_data_point_model_to_json(): + """Test conversion of a Polars DataFrame to JSON using the DataPointModel.""" + data_elements = pl.DataFrame(MockDHIS2Client().data_value_sets.get()) + single_point = DataPointModel( + dataElement=data_elements[0]["dataElement"].item(), + period=data_elements[0]["period"].item(), + orgUnit=data_elements[0]["orgUnit"].item(), + categoryOptionCombo=data_elements[0]["categoryOptionCombo"].item(), + attributeOptionCombo=data_elements[0]["attributeOptionCombo"].item(), + value=data_elements[0]["value"].item(), + ) + + payload = single_point.to_json() + assert payload["dataElement"] == data_elements[0]["dataElement"].item() + assert payload["period"] == data_elements[0]["period"].item() + assert payload["orgUnit"] == data_elements[0]["orgUnit"].item() + assert payload["categoryOptionCombo"] == data_elements[0]["categoryOptionCombo"].item() + assert payload["attributeOptionCombo"] == data_elements[0]["attributeOptionCombo"].item() + assert payload["value"] == data_elements[0]["value"].item() + + +def test_data_point_model_to_json_delete(): + """Test conversion of a Polars DataFrame to JSON using the DataPointModel.""" + data_elements = pl.DataFrame(MockDHIS2Client().data_value_sets.get()).slice(2, 2) + + # Set third datapoint to value None to simulate a deleted value + data_elements = data_elements.with_columns( + pl.when(pl.arange(0, data_elements.height) == 2).then(None).otherwise(pl.col("value")).alias("value") + ) + points_list = [ + DataPointModel( + dataElement=row["dataElement"], + period=row["period"], + orgUnit=row["orgUnit"], + categoryOptionCombo=row["categoryOptionCombo"], + 
attributeOptionCombo=row["attributeOptionCombo"], + value=row["value"], + ).to_json() + for row in data_elements.to_dicts() + ] + + assert len(points_list) == 2 + assert points_list[0]["dataElement"] == "CCC333" + assert points_list[0]["period"] == "202501" + assert points_list[0]["orgUnit"] == "ORG003" + assert points_list[0]["categoryOptionCombo"] == "CAT003" + assert points_list[0]["attributeOptionCombo"] == "ATTR003" + assert points_list[0]["value"] == "25" + assert points_list[0].get("comment") is None + assert points_list[1]["dataElement"] == "DELETE1" + assert not points_list[1]["value"] + assert points_list[1]["comment"] == "deleted value" diff --git a/d2d_development/tests/test_extract.py b/d2d_development/tests/test_extract.py new file mode 100644 index 0000000..8fd1175 --- /dev/null +++ b/d2d_development/tests/test_extract.py @@ -0,0 +1,259 @@ +import time +from unittest.mock import patch + +import polars as pl + +from d2d_development.extract import DHIS2Extractor +from tests.mock_dhis2_get import MockDHIS2Client + + +def test_extract_map_data_elements(): + """Test the mapping of data elements.""" + result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).data_elements._retrieve_data( + data_elements=[], org_units=[], period="202501" + ) + assert isinstance(result, pl.DataFrame) + assert result.shape == (9, 9) + assert result.columns == [ + "dataType", + "dx", + "period", + "orgUnit", + "categoryOptionCombo", + "attributeOptionCombo", + "rateMetric", + "domainType", + "value", + ] + assert set(result["dataType"]) == {"DATA_ELEMENT"} + assert set(result["dx"].drop_nulls()) == { + "AAA111", + "BBB222", + "CCC333", + "DELETE1", + "INVALID1", + "INVALID2", + "INVALID3", + "INVALID4", + } + assert set(result["period"].drop_nulls()) == {"202501"} + assert set(result["orgUnit"].drop_nulls()) == { + "ORG001", + "ORG003", + "ORG005", + "ORG006", + "ORG002", + "ORG004", + } + assert set(result["categoryOptionCombo"].drop_nulls()) == { + "CAT006", + "CAT005", + 
"CAT003", + "CAT002", + "CAT001", + "CAT004", + } + assert set(result["attributeOptionCombo"].drop_nulls()) == { + "ATTR001", + "ATTR002", + "ATTR003", + "ATTR004", + "ATTR005", + "ATTR006", + } + assert set(result["rateMetric"]) == {None} + assert set(result["domainType"]) == {"AGGREGATED"} + assert set(result["value"].drop_nulls()) == {"12", "18", "25", "55.0"} + + +def test_extract_map_reporting_rates(): + """Test the mapping of reporting rates.""" + result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).reporting_rates._retrieve_data( + reporting_rates=["AAA111.REPORTING_RATE", "BBB222.EXPECTED_REPORTS", "CCC333.REPORTING_RATE"], + org_units=[], + period="202409", + ) + assert isinstance(result, pl.DataFrame) + assert result.shape == (3, 9) + assert result.columns == [ + "dataType", + "dx", + "period", + "orgUnit", + "categoryOptionCombo", + "attributeOptionCombo", + "rateMetric", + "domainType", + "value", + ] + assert result["dataType"].unique().to_list() == ["REPORTING_RATE"] + assert result["dx"].to_list() == ["AAA111", "BBB222", "CCC333"] + assert result["period"].to_list() == ["202409", "202409", "202409"] + assert result["orgUnit"].to_list() == ["OU001", "OU002", "OU003"] + assert result["categoryOptionCombo"].to_list() == [None, None, None] + assert result["attributeOptionCombo"].to_list() == [None, None, None] + assert result["rateMetric"].to_list() == ["REPORTING_RATE", "EXPECTED_REPORTS", "REPORTING_RATE"] + assert result["domainType"].to_list() == ["AGGREGATED", "AGGREGATED", "AGGREGATED"] + assert result["value"].to_list() == ["100", "0", "100"] + + +def test_extract_map_indicator(): + """Test the mapping of indicators.""" + result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).indicators._retrieve_data( + indicators=["INDICATOR1", "INDICATOR2", "INDICATOR3"], org_units=[], period="202501" + ) + assert isinstance(result, pl.DataFrame) + assert result.shape == (3, 9) + assert result.columns == [ + "dataType", + "dx", + "period", + "orgUnit", + 
"categoryOptionCombo", + "attributeOptionCombo", + "rateMetric", + "domainType", + "value", + ] + assert result["dataType"].unique().to_list() == ["INDICATOR"] + assert result["dx"].to_list() == ["INDICATOR1", "INDICATOR2", "INDICATOR3"] + assert result["period"].to_list() == ["202501", "202501", "202501"] + assert result["orgUnit"].to_list() == ["ORG001", "ORG002", "ORG003"] + assert result["categoryOptionCombo"].to_list() == [None, None, None] + assert result["attributeOptionCombo"].to_list() == [None, None, None] + assert result["rateMetric"].to_list() == [None, None, None] + assert result["domainType"].to_list() == ["AGGREGATED", "AGGREGATED", "AGGREGATED"] + assert result["value"].to_list() == ["5.0", "7.0", "9.0"] + + +def test_extract_download_replace_no_file(tmp_path): # noqa: ANN001 + """Test DOWNLOAD_REPLACE mode, downloads and saves data to a Parquet file.""" + extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE") + filename = "test_extract_202501.parquet" + + # Call download_period + result_path = extractor.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + + # Assert file is created + assert result_path.exists() + assert result_path.name == filename + + +def test_download_replace_replaces_file_and_logs(tmp_path): # noqa: ANN001 + """Test DOWNLOAD_REPLACE mode, replaces the file if it already exists and logs the replacement.""" + extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE") + output_dir = tmp_path + period = "202501" + filename = "test_extract.parquet" + + # First call creates the file + file_path = extractor.data_elements.download_period( + data_elements=[], org_units=[], period=period, output_dir=output_dir, filename=filename + ) + assert file_path.exists() + mtime_before = file_path.stat().st_mtime + + time.sleep(1) # Ensure the filesystem timestamp will change + + # Patch 
current_run.log_info to capture log messages + with patch.object(extractor.logger, "info") as mock_log: + # Second call should replace the file and log the replacement + extractor.data_elements.download_period( + data_elements=[], org_units=[], period=period, output_dir=output_dir, filename=filename + ) + mtime_after = file_path.stat().st_mtime + # Check that the log message about replacing the extract was called + found = any("Replacing extract for period 202501" in str(call.args[0]) for call in mock_log.call_args_list) + assert found, "Expected log message about replacing extract not found" + # Check that the file was actually replaced (mtime changed) + assert mtime_after > mtime_before, "File was not actually replaced" + + +def test_extract_download_new_file_exists(tmp_path): # noqa: ANN001 + """Test DOWNLOAD_NEW mode, creates a new file if it does not exist, and skips if it does.""" + extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_NEW", return_existing_file=True) + filename = "test_extract_202501.parquet" + + # First call: file is created + result_new_path = extractor.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + assert result_new_path.exists() + assert result_new_path.name == filename + + # Second call: should skip and log the skip message + with patch.object(extractor.logger, "info") as mock_log: + result_path = extractor.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + assert result_path == result_new_path + found = any( + "Extract for period 202501 already exists, download skipped." 
in str(call.args[0]) + for call in mock_log.call_args_list + ) + assert found, "Expected log message about skipping extract not found" + + +def test_extract_download_new_return_existing_file(tmp_path): # noqa: ANN001 + """Test DOWNLOAD_NEW mode with return_existing_file True and False.""" + filename = "test_extract_202501.parquet" + + # True: should return the file path if it exists + extractor_true = DHIS2Extractor( + dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_NEW", return_existing_file=True + ) + # Create the file + path_true = extractor_true.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + # Second call: should return the same file path + result_true = extractor_true.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + assert result_true == path_true + + # False: should return None if the file exists + extractor_false = DHIS2Extractor( + dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_NEW", return_existing_file=False + ) + # Create the file + _ = extractor_false.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + # Second call: should return None + result_false = extractor_false.data_elements.download_period( + data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + ) + assert result_false is None + + +def test_extract_get_data_elements_with_indicator_extractor(): + """Test that we can retrieve data elements using the indicators extractor. + + Passing valid data element ids to the indicators parameter and including + the `include_cocs=True` flag should allow us to retrieve data elements with the indicators endpoint. 
+ """ + result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).indicators._retrieve_data( + indicators=["DATAELEMENT1", "DATAELEMENT2", "DATAELEMENT3"], + org_units=[], + period="202501", + include_cocs=True, # Include category option combo in the response + ) + + assert result.shape == (3, 9) + assert result.columns == [ + "dataType", + "dx", + "period", + "orgUnit", + "categoryOptionCombo", + "attributeOptionCombo", + "rateMetric", + "domainType", + "value", + ] + assert result["dataType"].unique().to_list() == ["INDICATOR"] + assert result["dx"].to_list() == ["DATAELEMENT1", "DATAELEMENT2", "DATAELEMENT3"] + assert result["categoryOptionCombo"].to_list() == ["COC001", "COC002", "COC003"] diff --git a/d2d_development/tests/test_push.py b/d2d_development/tests/test_push.py new file mode 100644 index 0000000..06e7e7d --- /dev/null +++ b/d2d_development/tests/test_push.py @@ -0,0 +1,463 @@ +from unittest.mock import patch + +import polars as pl +import pytest + +from d2d_development.extract import DHIS2Extractor +from d2d_development.push import DHIS2Pusher, PusherError +from tests.mock_dhis2_get import MockDHIS2Client +from tests.mock_dhis2_post import ( + MOCK_DHIS2_ERROR_409_RESPONSE_AOC, + MOCK_DHIS2_ERROR_409_RESPONSE_COC, + MOCK_DHIS2_ERROR_409_RESPONSE_DE, + MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS, + MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD, + MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT, + MOCK_DHIS2_ERROR_503_RESPONSE, + MOCK_DHIS2_OK_RESPONSE, + MockDHIS2Response, +) + + +def test_push_no_data_to_push(): + """Test the push of data points to DHIS2.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + cols = ["dx", "period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] + empty_df = pl.DataFrame({col: [] for col in cols}) + with patch.object(DHIS2Pusher, "_log_message") as mock_log_message: + pusher.push_data(empty_df) + mock_log_message.assert_any_call("Input DataFrame is empty. 
No data to push.") + assert pusher.summary["import_counts"]["imported"] == 0 + + +def test_push_missing_mandatory_columns(): + """Test the push of data points to DHIS2.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + cols = ["period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] + empty_df = pl.DataFrame({col: [] for col in cols}) + with pytest.raises(PusherError, match=r"Input data is missing mandatory columns: dx"): + pusher.push_data(df_data=empty_df) + + +def test_push_wrong_input_type(): + """Test the push of data points to DHIS2.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): + pusher.push_data(df_data=[]) + with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): + pusher.push_data(df_data="not a dataframe") + with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): + pusher.push_data(df_data={}) + + +def test_push_serialize_data_point_valid(): + """Test the serialization of a DataPointModel to JSON format for DHIS2.""" + data_point = ( + DHIS2Extractor(dhis2_client=MockDHIS2Client()) + .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") + .slice(0, 1) + ) + + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + json_payload = pusher._serialize_data_points(data_point) + + assert json_payload[0]["dataElement"] == "AAA111" + assert json_payload[0]["period"] == "202501" + assert json_payload[0]["orgUnit"] == "ORG001" + assert json_payload[0]["categoryOptionCombo"] == "CAT001" + assert json_payload[0]["attributeOptionCombo"] == "ATTR001" + assert json_payload[0]["value"] == "12" + + +def test_push_serialize_data_point_to_delete(): + """Test the serialization of a DataPointModel to delete JSON format for DHIS2.""" + data_point = ( + DHIS2Extractor(dhis2_client=MockDHIS2Client()) + 
.data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") + .slice(3, 1) + ) + + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + json_payload = pusher._serialize_data_points(data_point) + + assert json_payload[0]["dataElement"] == "DELETE1" + assert json_payload[0]["period"] == "202501" + assert json_payload[0]["orgUnit"] == "ORG004" + assert json_payload[0]["categoryOptionCombo"] == "CAT004" + assert json_payload[0]["attributeOptionCombo"] == "ATTR004" + assert not json_payload[0]["value"] + assert json_payload[0]["comment"] == "deleted value" + + +def test_push_classify_points(): + """Test the mapping of data elements.""" + data_points = DHIS2Extractor(dhis2_client=MockDHIS2Client()).data_elements._retrieve_data( + data_elements=["AAA111", "BBB222", "CCC333"], org_units=[], period="202501" + ) + assert isinstance(data_points, pl.DataFrame) + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + valid, to_delete, not_valid = pusher._classify_data_points(data_points) + + # Verify no overlaps and all rows accounted for + assert len(valid) + len(to_delete) + len(not_valid) == len(data_points), ( + "Row count mismatch! Check for overlaps or missing rows." + ) + assert len(valid) == 3, "Expected 3 valid data points." + assert len(to_delete) == 1, "Expected 1 data point marked for deletion" + assert len(not_valid) == 5, "Expected 4 invalid data points." 
+ + +def test_push_log_invalid_data_points(): + """Test the logging of invalid data points.""" + data_points = ( + DHIS2Extractor(dhis2_client=MockDHIS2Client()) + .data_elements._retrieve_data(data_elements=[], org_units=[], period="202501") + .slice(4, 4) # Select invalid data points (rows 4 to 7) for testing + ) + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + _, _, not_valid = pusher._classify_data_points(data_points) + + with patch.object(pusher, "_log_message") as mock_log_message: + pusher._log_ignored_or_na(not_valid) + assert mock_log_message.call_count == 5, "Expected a log message for each invalid data point." + for idx, call in enumerate(mock_log_message.call_args_list): + if idx == 0: + log_message = call.args[0] + assert "4 data points will be ignored" in log_message, f"Unexpected log message: {log_message}" + else: + log_message = call.args[0] + assert f"Data point ignored: dx=INVALID{idx}" in log_message, f"Unexpected log message: {log_message}" + + +def test_push_data_point(): + """Test the push of data points to DHIS2.""" + # 1 valid datapoint + data_points = ( + DHIS2Extractor(dhis2_client=MockDHIS2Client()) + .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") + .slice(0, 1) + ) + + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + # MOCK_DHIS2_OK_RESPONSE was manually manufactured to simulate a successful import response from DHIS2 for tests + with patch.object(pusher.dhis2_client.api.session, "post", return_value=MockDHIS2Response(MOCK_DHIS2_OK_RESPONSE)): + pusher.push_data(data_points) + assert pusher.summary["import_counts"]["imported"] == 1 + assert pusher.summary["import_counts"]["ignored"] == 0 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert pusher.summary["import_options"] == { + "importStrategy": "CREATE_AND_UPDATE", + "dryRun": True, + "preheatCache": True, + "skipAudit": True, + } + + +def 
test_push_data_points_connection_error(): + """Test the error handling of error 503 to DHIS2.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_503_RESPONSE, status_code=503), + ): + with pytest.raises(PusherError, match=r"Server error: Service temporarily unavailable"): + pusher._push_data_points([{"dummy_datapoint": "1"}]) + # After the exception, check the summary + assert len(pusher.summary["ERRORS"]) == 1 + assert pusher.summary["ERRORS"][0]["message"] == "Server error: Service temporarily unavailable" + assert pusher.summary["ERRORS"][0]["server_error_code"] == "503" + + +def test_push_data_points_data_element_error(): + """Test the error handling push of data points with wrong data elements.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_data_points = [ + { + "dataElement": "VALID", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "INVALID_1", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "INVALID_2", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_DE was manually manufactured to simulate a 409 Conflict from DHIS2. 
+ with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_DE, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["import_counts"]["imported"] == 1 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["ignored"] == 2 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert len(pusher.summary["ERRORS"]) == 2 + assert pusher.summary["ERRORS"][0]["object"] == "INVALID_1" + assert pusher.summary["ERRORS"][1]["object"] == "INVALID_2" + + +def test_push_data_points_org_unit_error(): + """Test the error handling push of data points with wrong org units.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_data_points = [ + { + "dataElement": "VALID", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID", + "period": "202501", + "orgUnit": "INVALID_1", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID", + "period": "202501", + "orgUnit": "INVALID_2", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS was manually manufactured to simulate a Conflict from DHIS2. 
+ with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["import_counts"]["imported"] == 1 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["ignored"] == 2 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert len(pusher.summary["ERRORS"]) == 2 + assert pusher.summary["ERRORS"][0]["object"] == "INVALID_1_OU" + assert pusher.summary["ERRORS"][1]["object"] == "INVALID_2_OU" + + +def test_push_data_points_period_error(): + """Test the error handling push of data points with wrong periods.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_data_points = [ + { + "dataElement": "VALID", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID", + "period": "INVALID_PERIOD_1", + "orgUnit": "VALID_OU", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID", + "period": "INVALID_PERIOD_2", + "orgUnit": "VALID_OU", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD was manually manufactured to simulate a Conflict from DHIS2. 
+ with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["import_counts"]["imported"] == 1 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["ignored"] == 2 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert len(pusher.summary["ERRORS"]) == 2 + assert pusher.summary["ERRORS"][0]["object"] == "INVALID_PERIOD_1" + assert pusher.summary["ERRORS"][1]["object"] == "INVALID_PERIOD_2" + + +def test_push_data_points_coc_error(): + """Test the error handling push of data points with wrong COC.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_data_points = [ + { + "dataElement": "VALID1", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID2", + "period": "202501", + "orgUnit": "ORG002", + "categoryOptionCombo": "INVALID_COC_1", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID3", + "period": "202501", + "orgUnit": "ORG003", + "categoryOptionCombo": "INVALID_COC_2", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_COC was manually manufactured to simulate a Conflict from DHIS2. 
+ with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_COC, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["import_counts"]["imported"] == 1 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["ignored"] == 2 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert len(pusher.summary["ERRORS"]) == 2 + assert pusher.summary["ERRORS"][0]["object"] == "INVALID_COC_1" + assert pusher.summary["ERRORS"][1]["object"] == "INVALID_COC_2" + + +def test_push_data_points_aoc_error(): + """Test the error handling push of data points with wrong AOC.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_data_points = [ + { + "dataElement": "VALID1", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID2", + "period": "202501", + "orgUnit": "ORG002", + "categoryOptionCombo": "INVALID_AOC_1", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID3", + "period": "202501", + "orgUnit": "ORG003", + "categoryOptionCombo": "INVALID_AOC_2", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_AOC was manually manufactured to simulate a Conflict from DHIS2. 
+ with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_AOC, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["import_counts"]["imported"] == 1 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["ignored"] == 2 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert len(pusher.summary["ERRORS"]) == 2 + assert pusher.summary["ERRORS"][0]["object"] == "INVALID_AOC_1" + assert pusher.summary["ERRORS"][1]["object"] == "INVALID_AOC_2" + + +def test_push_data_points_value_format_error(): + """Test the error handling push of data points with value not numeric.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_data_points = [ + { + "dataElement": "VALID1", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + { + "dataElement": "VALID2", + "period": "202501", + "orgUnit": "ORG002", + "categoryOptionCombo": "CAT002", + "attributeOptionCombo": "ATTR002", + "value": "0.0000e15", # Non numeric format for DHIS2 API + }, + { + "dataElement": "VALID3", + "period": "202501", + "orgUnit": "ORG003", + "categoryOptionCombo": "CAT003", + "attributeOptionCombo": "ATTR003", + "value": "1", + }, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT was manually manufactured to simulate a Conflict from DHIS2. 
+ with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["import_counts"]["imported"] == 2 + assert pusher.summary["import_counts"]["updated"] == 0 + assert pusher.summary["import_counts"]["ignored"] == 1 + assert pusher.summary["import_counts"]["deleted"] == 0 + assert len(pusher.summary["ERRORS"]) == 1 + assert pusher.summary["ERRORS"][0]["object"] == "VALID2" diff --git a/d2d_development/tests/test_utils.py b/d2d_development/tests/test_utils.py new file mode 100644 index 0000000..37356c3 --- /dev/null +++ b/d2d_development/tests/test_utils.py @@ -0,0 +1,113 @@ +import logging +from pathlib import Path +from unittest.mock import Mock, patch + +import pandas as pd +import polars as pl +import pytest + +from d2d_development.exceptions import ExtractorError +from d2d_development.utils import log_message, save_to_parquet + + +class CustomError(Exception): + """Custom exception for testing invalid logging levels.""" + + pass + + +def test_log_message_info(): + """Test that log_message logs info messages correctly.""" + logger = Mock(spec=logging.Logger) + with patch("d2d_development.utils.current_run") as mock_run: + log_message(logger, "msg", level="info") + logger.info.assert_called_once_with("msg") + mock_run.log_info.assert_called_once_with("msg") + + +def test_log_message_warning(): + """Test that log_message logs warning messages correctly.""" + logger = Mock(spec=logging.Logger) + with patch("d2d_development.utils.current_run") as mock_run: + log_message(logger, "warn", level="warning") + logger.warning.assert_called_once_with("warn") + mock_run.log_warning.assert_called_once_with("warn") + + +def test_log_message_error(): + """Test that log_message logs error messages correctly, including error details.""" + logger = 
Mock(spec=logging.Logger) + with patch("d2d_development.utils.current_run") as mock_run: + log_message(logger, "err", error_details="details", level="error") + logger.error.assert_called_once_with("err Details: details") + mock_run.log_error.assert_called_once_with("err") + + +def test_log_message_no_current_run(): + """Test that log_message works even if current_run is not available.""" + logger = Mock(spec=logging.Logger) + with patch("d2d_development.utils.current_run", None): + log_message(logger, "msg", level="info") + logger.info.assert_called_once_with("msg") + + +def test_log_message_log_current_run_false(): + """Test that log_message does not log to current_run when log_current_run is False.""" + logger = Mock(spec=logging.Logger) + with patch("d2d_development.utils.current_run") as mock_run: + log_message(logger, "msg", level="info", log_current_run=False) + logger.info.assert_called_once_with("msg") + mock_run.log_info.assert_not_called() + + +def test_log_message_invalid_level(): + """Test that log_message raises the specified exception for invalid logging levels.""" + logger = Mock(spec=logging.Logger) + with pytest.raises(CustomError): + log_message(logger, "bad", level="bad", exception_class=CustomError) + + +def test_save_polars_dataframe(tmp_path: Path): + """Test saving a Polars DataFrame to Parquet.""" + df = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) + file = tmp_path / "test.parquet" + save_to_parquet(df, file) + assert file.exists() + + +def test_save_pandas_dataframe(tmp_path: Path): + """Test saving a Pandas DataFrame to Parquet.""" + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + file = tmp_path / "test_pd.parquet" + save_to_parquet(df, file) + assert file.exists() + + +def test_invalid_type_raises(tmp_path: Path): + """Test that passing an invalid type raises an ExtractorError.""" + file = tmp_path / "fail.parquet" + with pytest.raises(ExtractorError): + save_to_parquet([1, 2, 3], file) + + +def test_overwrite_file(tmp_path: Path): + 
"""Test that saving to an existing file overwrites it.""" + df1 = pd.DataFrame({"a": [1]}) + df2 = pd.DataFrame({"a": [2]}) + file = tmp_path / "overwrite.parquet" + save_to_parquet(df1, file) + save_to_parquet(df2, file) + result = pd.read_parquet(file) + assert result["a"].iloc[0] == 2 + + +def test_write_exception_cleanup(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + """Test that if writing to Parquet raises an exception, no temp files are left behind.""" + df = pd.DataFrame({"a": [1]}) + file = tmp_path / "fail.parquet" + # Patch to_parquet to raise + monkeypatch.setattr(df, "to_parquet", lambda *a, **k: (_ for _ in ()).throw(Exception("fail"))) + with pytest.raises(ExtractorError): + save_to_parquet(df, file) + # Check no temp files left + assert not any(tmp_path.glob("*.parquet*")) From ff867092814c3caa57aafc76660f2b61173a3c74 Mon Sep 17 00:00:00 2001 From: EMontandon Date: Thu, 26 Mar 2026 18:00:50 +0100 Subject: [PATCH 2/8] fix(): pyproject to root + readme + ruff format --- d2d_development/README.md | 96 +++++++++++++++++++ d2d_development/d2d_development/push.py | 9 +- d2d_development/pyproject.toml | 1 + d2d_development/tests/mock_dhis2_get.py | 13 ++- d2d_development/tests/test_data_point.py | 14 ++- d2d_development/tests/test_extract.py | 110 ++++++++++++++++----- d2d_development/tests/test_push.py | 116 +++++++++++++++++------ d2d_development/tests/test_utils.py | 4 +- pyproject.toml | 70 ++++++++++++++ 9 files changed, 370 insertions(+), 63 deletions(-) create mode 100644 pyproject.toml diff --git a/d2d_development/README.md b/d2d_development/README.md index 3d0277a..962394b 100644 --- a/d2d_development/README.md +++ b/d2d_development/README.md @@ -9,3 +9,99 @@ Install this library on its own: ```bash pip install git+https://github.com/BLSQ/openhexa-ds-developments.git#subdirectory=d2d_development ``` + +## Main Classes + +### DHIS2Extractor + +**Description:** +Main class to extract data from DHIS2. 
It provides unified handlers for extracting data elements, indicators, and reporting rates, saving them to disk in a standardized format. + + +**Configuration Parameters:** +When initializing `DHIS2Extractor`, you can configure the following parameters: + +- `dhis2_client` (required): The DHIS2 client instance. +- `download_mode`: Controls how files are saved when extracting data. Use `"DOWNLOAD_REPLACE"` (default) to always overwrite files, or `"DOWNLOAD_NEW"` to skip downloading if the file already exists. +- `return_existing_file`: If `True` and using `DOWNLOAD_NEW`, returns the path to the existing file instead of `None` when a file already exists (default: `False`). +- `logger`: Optional custom logger instance. + +Example: +```python +extractor = DHIS2Extractor(dhis2_client, download_mode="DOWNLOAD_NEW", return_existing_file=True) +``` + +**Usage Example:** +```python +from d2d_development.extract import DHIS2Extractor +from openhexa.sdk import workspace +from openhexa.toolbox.dhis2 import DHIS2 +from pathlib import Path + +dhis2_client = DHIS2(workspace.get_connection("dhis2-connection")) +extractor = DHIS2Extractor(dhis2_client, download_mode="DOWNLOAD_REPLACE") + +# Extract several periods of data elements +for period in ["202401", "202402", "202403"]: + extractor.data_elements.download_period( + data_elements=["de1", "de2"], + org_units=["ou1", "ou2"], + period=period, + output_dir=Path("/output") + ) +# Extract one period of indicators +extractor.indicators.download_period( + indicators=["ind1"], + org_units=["ou1"], + period="202401", + output_dir=Path("/tmp") +) +# Extract one period of reporting rates +extractor.reporting_rates.download_period( + reporting_rates=["rr1"], + org_units=["ou1"], + period="202401", + output_dir=Path("/tmp") +) +``` + +### DHIS2Pusher + +**Description:** +Main class to handle pushing data to DHIS2. It validates and pushes formatted data (pandas or polars DataFrame) to a DHIS2 instance. 
+ +**Configuration Parameters:** +When initializing `DHIS2Pusher`, you can configure the following parameters: + +- `dhis2_client` (required): The DHIS2 client instance. +- `import_strategy`: Strategy flag passed to the DHIS2 API for data import. Accepts "CREATE", "UPDATE", or "CREATE_AND_UPDATE" (default: "CREATE_AND_UPDATE"). This only controls how the DHIS2 server processes the data; it does not affect client-side logic. +- `dry_run`: If `True`, simulates the push without making changes on the server (default: `True`). +- `max_post`: Maximum number of data points per POST request (default: `500`). +- `logging_interval`: Log progress every N data points (default: `50000`). +- `logger`: Optional custom logger instance. + +**Usage Example:** +```python +from d2d_development.push import DHIS2Pusher +from openhexa.sdk import workspace +from openhexa.toolbox.dhis2 import DHIS2 +import polars as pl + +dhis2_client = DHIS2(workspace.get_connection("dhis2-connection")) +pusher = DHIS2Pusher( + dhis2_client, + import_strategy="CREATE_AND_UPDATE", # or "CREATE", "UPDATE" + dry_run=False, + max_post=1000, + logging_interval=10000, +) + +df = pl.DataFrame({ + "dx": ["de1"], + "period": ["202401"], + "orgUnit": ["ou1"], + "categoryOptionCombo": ["coc"], + "attributeOptionCombo": ["aoc"], + "value": [123]}) +pusher.push_data(df) +``` diff --git a/d2d_development/d2d_development/push.py b/d2d_development/d2d_development/push.py index 766d415..c6286b2 100644 --- a/d2d_development/d2d_development/push.py +++ b/d2d_development/d2d_development/push.py @@ -21,7 +21,6 @@ def __init__( dry_run: bool = True, max_post: int = 500, logging_interval: int = 50000, - mandatory_fields: list[str] | None = None, logger: logging.Logger | None = None, ): self.dhis2_client = dhis2_client @@ -29,11 +28,7 @@ def __init__( if import_strategy not in {"CREATE", "UPDATE", "CREATE_AND_UPDATE"}: raise PusherError("Invalid import strategy (use 'CREATE', 'UPDATE' or 'CREATE_AND_UPDATE')") - if 
mandatory_fields is None: - self.mandatory_fields = ["dx", "period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] - else: - self.mandatory_fields = mandatory_fields - + self.mandatory_fields = ["dx", "period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] self.import_strategy = import_strategy self.dry_run = dry_run self.max_post = max_post @@ -185,7 +180,7 @@ def _log_summary_errors(self): else: self._log_message(f"Logging {len(errors)} error(s) from import summary.", level="error") for i_e, error in enumerate(errors, start=1): - self._log_message(f"Error response {i_e}: {error}", level="error") + self._log_message(f"Error response {i_e}: {error}", log_current_run=False, level="error") def _post(self, chunk: list[dict]) -> requests.Response: """Send a POST request to DHIS2 for a chunk of data values. diff --git a/d2d_development/pyproject.toml b/d2d_development/pyproject.toml index eb5bafa..99e6eb8 100644 --- a/d2d_development/pyproject.toml +++ b/d2d_development/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "pandas>=2.2.0", "polars>=1.0.0", "packaging>=23.0", + "pyarrow>=10.0.0", ] [project.optional-dependencies] diff --git a/d2d_development/tests/mock_dhis2_get.py b/d2d_development/tests/mock_dhis2_get.py index 0c18c7f..48df8ec 100644 --- a/d2d_development/tests/mock_dhis2_get.py +++ b/d2d_development/tests/mock_dhis2_get.py @@ -1,7 +1,9 @@ class MockDataValueSets: """Mock class to simulate DHIS2 DataValueSets API responses for testing purposes.""" - def get(self, data_elements=None, periods=None, org_units=None, last_updated=None) -> list[dict]: # noqa: ANN001 + def get( + self, data_elements=None, periods=None, org_units=None, last_updated=None + ) -> list[dict]: # noqa: ANN001 """Simulate the retrieval of data values from DHIS2 based on the provided parameters. 
Returns @@ -135,7 +137,14 @@ def get(self, data_elements=None, periods=None, org_units=None, last_updated=Non class MockAnalytics: """Mock class to simulate DHIS2 Analytics API responses for testing purposes.""" - def get(self, indicators=None, data_elements=None, periods=None, org_units=None, include_cocs=False) -> list[dict]: # noqa: ANN001 + def get( + self, + indicators=None, + data_elements=None, + periods=None, + org_units=None, + include_cocs=False, + ) -> list[dict]: # noqa: ANN001 """Simulate the retrieval of analytics data from DHIS2 based on the provided parameters. Returns diff --git a/d2d_development/tests/test_data_point.py b/d2d_development/tests/test_data_point.py index 824ba02..fe78919 100644 --- a/d2d_development/tests/test_data_point.py +++ b/d2d_development/tests/test_data_point.py @@ -39,8 +39,13 @@ def test_data_point_model_to_json(): assert payload["dataElement"] == data_elements[0]["dataElement"].item() assert payload["period"] == data_elements[0]["period"].item() assert payload["orgUnit"] == data_elements[0]["orgUnit"].item() - assert payload["categoryOptionCombo"] == data_elements[0]["categoryOptionCombo"].item() - assert payload["attributeOptionCombo"] == data_elements[0]["attributeOptionCombo"].item() + assert ( + payload["categoryOptionCombo"] == data_elements[0]["categoryOptionCombo"].item() + ) + assert ( + payload["attributeOptionCombo"] + == data_elements[0]["attributeOptionCombo"].item() + ) assert payload["value"] == data_elements[0]["value"].item() @@ -50,7 +55,10 @@ def test_data_point_model_to_json_delete(): # Set third datapoint to value None to simulate a deleted value data_elements = data_elements.with_columns( - pl.when(pl.arange(0, data_elements.height) == 2).then(None).otherwise(pl.col("value")).alias("value") + pl.when(pl.arange(0, data_elements.height) == 2) + .then(None) + .otherwise(pl.col("value")) + .alias("value") ) points_list = [ DataPointModel( diff --git a/d2d_development/tests/test_extract.py 
b/d2d_development/tests/test_extract.py index 8fd1175..62b7d70 100644 --- a/d2d_development/tests/test_extract.py +++ b/d2d_development/tests/test_extract.py @@ -9,9 +9,9 @@ def test_extract_map_data_elements(): """Test the mapping of data elements.""" - result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).data_elements._retrieve_data( - data_elements=[], org_units=[], period="202501" - ) + result = DHIS2Extractor( + dhis2_client=MockDHIS2Client() + ).data_elements._retrieve_data(data_elements=[], org_units=[], period="202501") assert isinstance(result, pl.DataFrame) assert result.shape == (9, 9) assert result.columns == [ @@ -68,8 +68,14 @@ def test_extract_map_data_elements(): def test_extract_map_reporting_rates(): """Test the mapping of reporting rates.""" - result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).reporting_rates._retrieve_data( - reporting_rates=["AAA111.REPORTING_RATE", "BBB222.EXPECTED_REPORTS", "CCC333.REPORTING_RATE"], + result = DHIS2Extractor( + dhis2_client=MockDHIS2Client() + ).reporting_rates._retrieve_data( + reporting_rates=[ + "AAA111.REPORTING_RATE", + "BBB222.EXPECTED_REPORTS", + "CCC333.REPORTING_RATE", + ], org_units=[], period="202409", ) @@ -92,7 +98,11 @@ def test_extract_map_reporting_rates(): assert result["orgUnit"].to_list() == ["OU001", "OU002", "OU003"] assert result["categoryOptionCombo"].to_list() == [None, None, None] assert result["attributeOptionCombo"].to_list() == [None, None, None] - assert result["rateMetric"].to_list() == ["REPORTING_RATE", "EXPECTED_REPORTS", "REPORTING_RATE"] + assert result["rateMetric"].to_list() == [ + "REPORTING_RATE", + "EXPECTED_REPORTS", + "REPORTING_RATE", + ] assert result["domainType"].to_list() == ["AGGREGATED", "AGGREGATED", "AGGREGATED"] assert result["value"].to_list() == ["100", "0", "100"] @@ -100,7 +110,9 @@ def test_extract_map_reporting_rates(): def test_extract_map_indicator(): """Test the mapping of indicators.""" result = 
DHIS2Extractor(dhis2_client=MockDHIS2Client()).indicators._retrieve_data( - indicators=["INDICATOR1", "INDICATOR2", "INDICATOR3"], org_units=[], period="202501" + indicators=["INDICATOR1", "INDICATOR2", "INDICATOR3"], + org_units=[], + period="202501", ) assert isinstance(result, pl.DataFrame) assert result.shape == (3, 9) @@ -128,12 +140,18 @@ def test_extract_map_indicator(): def test_extract_download_replace_no_file(tmp_path): # noqa: ANN001 """Test DOWNLOAD_REPLACE mode, downloads and saves data to a Parquet file.""" - extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE") + extractor = DHIS2Extractor( + dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE" + ) filename = "test_extract_202501.parquet" # Call download_period result_path = extractor.data_elements.download_period( - data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) # Assert file is created @@ -143,14 +161,20 @@ def test_extract_download_replace_no_file(tmp_path): # noqa: ANN001 def test_download_replace_replaces_file_and_logs(tmp_path): # noqa: ANN001 """Test DOWNLOAD_REPLACE mode, replaces the file if it already exists and logs the replacement.""" - extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE") + extractor = DHIS2Extractor( + dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE" + ) output_dir = tmp_path period = "202501" filename = "test_extract.parquet" # First call creates the file file_path = extractor.data_elements.download_period( - data_elements=[], org_units=[], period=period, output_dir=output_dir, filename=filename + data_elements=[], + org_units=[], + period=period, + output_dir=output_dir, + filename=filename, ) assert file_path.exists() mtime_before = file_path.stat().st_mtime @@ -161,11 +185,18 @@ def 
test_download_replace_replaces_file_and_logs(tmp_path): # noqa: ANN001 with patch.object(extractor.logger, "info") as mock_log: # Second call should replace the file and log the replacement extractor.data_elements.download_period( - data_elements=[], org_units=[], period=period, output_dir=output_dir, filename=filename + data_elements=[], + org_units=[], + period=period, + output_dir=output_dir, + filename=filename, ) mtime_after = file_path.stat().st_mtime # Check that the log message about replacing the extract was called - found = any("Replacing extract for period 202501" in str(call.args[0]) for call in mock_log.call_args_list) + found = any( + "Replacing extract for period 202501" in str(call.args[0]) + for call in mock_log.call_args_list + ) assert found, "Expected log message about replacing extract not found" # Check that the file was actually replaced (mtime changed) assert mtime_after > mtime_before, "File was not actually replaced" @@ -173,12 +204,20 @@ def test_download_replace_replaces_file_and_logs(tmp_path): # noqa: ANN001 def test_extract_download_new_file_exists(tmp_path): # noqa: ANN001 """Test DOWNLOAD_NEW mode, creates a new file if it does not exist, and skips if it does.""" - extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_NEW", return_existing_file=True) + extractor = DHIS2Extractor( + dhis2_client=MockDHIS2Client(), + download_mode="DOWNLOAD_NEW", + return_existing_file=True, + ) filename = "test_extract_202501.parquet" # First call: file is created result_new_path = extractor.data_elements.download_period( - data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) assert result_new_path.exists() assert result_new_path.name == filename @@ -186,11 +225,16 @@ def test_extract_download_new_file_exists(tmp_path): # noqa: ANN001 # Second call: should skip and log the skip message 
with patch.object(extractor.logger, "info") as mock_log: result_path = extractor.data_elements.download_period( - data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) assert result_path == result_new_path found = any( - "Extract for period 202501 already exists, download skipped." in str(call.args[0]) + "Extract for period 202501 already exists, download skipped." + in str(call.args[0]) for call in mock_log.call_args_list ) assert found, "Expected log message about skipping extract not found" @@ -202,29 +246,49 @@ def test_extract_download_new_return_existing_file(tmp_path): # noqa: ANN001 # True: should return the file path if it exists extractor_true = DHIS2Extractor( - dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_NEW", return_existing_file=True + dhis2_client=MockDHIS2Client(), + download_mode="DOWNLOAD_NEW", + return_existing_file=True, ) # Create the file path_true = extractor_true.data_elements.download_period( - data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) # Second call: should return the same file path result_true = extractor_true.data_elements.download_period( - data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) assert result_true == path_true # False: should return None if the file exists extractor_false = DHIS2Extractor( - dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_NEW", return_existing_file=False + dhis2_client=MockDHIS2Client(), + download_mode="DOWNLOAD_NEW", + return_existing_file=False, ) # Create the file _ = extractor_false.data_elements.download_period( - data_elements=[], org_units=[], period="202501", 
output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) # Second call: should return None result_false = extractor_false.data_elements.download_period( - data_elements=[], org_units=[], period="202501", output_dir=tmp_path, filename=filename + data_elements=[], + org_units=[], + period="202501", + output_dir=tmp_path, + filename=filename, ) assert result_false is None diff --git a/d2d_development/tests/test_push.py b/d2d_development/tests/test_push.py index 06e7e7d..226fe11 100644 --- a/d2d_development/tests/test_push.py +++ b/d2d_development/tests/test_push.py @@ -22,7 +22,14 @@ def test_push_no_data_to_push(): """Test the push of data points to DHIS2.""" pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) - cols = ["dx", "period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] + cols = [ + "dx", + "period", + "orgUnit", + "categoryOptionCombo", + "attributeOptionCombo", + "value", + ] empty_df = pl.DataFrame({col: [] for col in cols}) with patch.object(DHIS2Pusher, "_log_message") as mock_log_message: pusher.push_data(empty_df) @@ -35,18 +42,26 @@ def test_push_missing_mandatory_columns(): pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) cols = ["period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] empty_df = pl.DataFrame({col: [] for col in cols}) - with pytest.raises(PusherError, match=r"Input data is missing mandatory columns: dx"): + with pytest.raises( + PusherError, match=r"Input data is missing mandatory columns: dx" + ): pusher.push_data(df_data=empty_df) def test_push_wrong_input_type(): """Test the push of data points to DHIS2.""" pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) - with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): + with pytest.raises( + PusherError, match=r"Input data must be a pandas or polars DataFrame." 
+ ): pusher.push_data(df_data=[]) - with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): + with pytest.raises( + PusherError, match=r"Input data must be a pandas or polars DataFrame." + ): pusher.push_data(df_data="not a dataframe") - with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): + with pytest.raises( + PusherError, match=r"Input data must be a pandas or polars DataFrame." + ): pusher.push_data(df_data={}) @@ -54,7 +69,9 @@ def test_push_serialize_data_point_valid(): """Test the serialization of a DataPointModel to JSON format for DHIS2.""" data_point = ( DHIS2Extractor(dhis2_client=MockDHIS2Client()) - .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") + .data_elements._retrieve_data( + data_elements=["AAA111"], org_units=[], period="202501" + ) .slice(0, 1) ) @@ -73,7 +90,9 @@ def test_push_serialize_data_point_to_delete(): """Test the serialization of a DataPointModel to delete JSON format for DHIS2.""" data_point = ( DHIS2Extractor(dhis2_client=MockDHIS2Client()) - .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") + .data_elements._retrieve_data( + data_elements=["AAA111"], org_units=[], period="202501" + ) .slice(3, 1) ) @@ -91,7 +110,9 @@ def test_push_serialize_data_point_to_delete(): def test_push_classify_points(): """Test the mapping of data elements.""" - data_points = DHIS2Extractor(dhis2_client=MockDHIS2Client()).data_elements._retrieve_data( + data_points = DHIS2Extractor( + dhis2_client=MockDHIS2Client() + ).data_elements._retrieve_data( data_elements=["AAA111", "BBB222", "CCC333"], org_units=[], period="202501" ) assert isinstance(data_points, pl.DataFrame) @@ -119,14 +140,20 @@ def test_push_log_invalid_data_points(): with patch.object(pusher, "_log_message") as mock_log_message: pusher._log_ignored_or_na(not_valid) - assert mock_log_message.call_count == 5, "Expected a log message for each 
invalid data point." + assert mock_log_message.call_count == 5, ( + "Expected a log message for each invalid data point." + ) for idx, call in enumerate(mock_log_message.call_args_list): if idx == 0: log_message = call.args[0] - assert "4 data points will be ignored" in log_message, f"Unexpected log message: {log_message}" + assert "4 data points will be ignored" in log_message, ( + f"Unexpected log message: {log_message}" + ) else: log_message = call.args[0] - assert f"Data point ignored: dx=INVALID{idx}" in log_message, f"Unexpected log message: {log_message}" + assert f"Data point ignored: dx=INVALID{idx}" in log_message, ( + f"Unexpected log message: {log_message}" + ) def test_push_data_point(): @@ -134,13 +161,19 @@ def test_push_data_point(): # 1 valid datapoint data_points = ( DHIS2Extractor(dhis2_client=MockDHIS2Client()) - .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") + .data_elements._retrieve_data( + data_elements=["AAA111"], org_units=[], period="202501" + ) .slice(0, 1) ) pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) # MOCK_DHIS2_OK_RESPONSE was manually manufactured to simulate a successful import response from DHIS2 for tests - with patch.object(pusher.dhis2_client.api.session, "post", return_value=MockDHIS2Response(MOCK_DHIS2_OK_RESPONSE)): + with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_OK_RESPONSE), + ): pusher.push_data(data_points) assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["ignored"] == 0 @@ -162,11 +195,16 @@ def test_push_data_points_connection_error(): "post", return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_503_RESPONSE, status_code=503), ): - with pytest.raises(PusherError, match=r"Server error: Service temporarily unavailable"): + with pytest.raises( + PusherError, match=r"Server error: Service temporarily unavailable" + ): pusher._push_data_points([{"dummy_datapoint": "1"}]) # 
After the exception, check the summary assert len(pusher.summary["ERRORS"]) == 1 - assert pusher.summary["ERRORS"][0]["message"] == "Server error: Service temporarily unavailable" + assert ( + pusher.summary["ERRORS"][0]["message"] + == "Server error: Service temporarily unavailable" + ) assert pusher.summary["ERRORS"][0]["server_error_code"] == "503" @@ -207,9 +245,13 @@ def test_push_data_points_data_element_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_DE, status_code=409), + return_value=MockDHIS2Response( + MOCK_DHIS2_ERROR_409_RESPONSE_DE, status_code=409 + ), ): - pusher._push_data_points(invalid_data_points) # access private method for error handling testing + pusher._push_data_points( + invalid_data_points + ) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -256,9 +298,13 @@ def test_push_data_points_org_unit_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS, status_code=409), + return_value=MockDHIS2Response( + MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS, status_code=409 + ), ): - pusher._push_data_points(invalid_data_points) # access private method for error handling testing + pusher._push_data_points( + invalid_data_points + ) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -305,9 +351,13 @@ def test_push_data_points_period_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD, status_code=409), + return_value=MockDHIS2Response( + 
MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD, status_code=409 + ), ): - pusher._push_data_points(invalid_data_points) # access private method for error handling testing + pusher._push_data_points( + invalid_data_points + ) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -354,9 +404,13 @@ def test_push_data_points_coc_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_COC, status_code=409), + return_value=MockDHIS2Response( + MOCK_DHIS2_ERROR_409_RESPONSE_COC, status_code=409 + ), ): - pusher._push_data_points(invalid_data_points) # access private method for error handling testing + pusher._push_data_points( + invalid_data_points + ) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -403,9 +457,13 @@ def test_push_data_points_aoc_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_AOC, status_code=409), + return_value=MockDHIS2Response( + MOCK_DHIS2_ERROR_409_RESPONSE_AOC, status_code=409 + ), ): - pusher._push_data_points(invalid_data_points) # access private method for error handling testing + pusher._push_data_points( + invalid_data_points + ) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -452,9 +510,13 @@ def test_push_data_points_value_format_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT, status_code=409), + 
return_value=MockDHIS2Response( + MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT, status_code=409 + ), ): - pusher._push_data_points(invalid_data_points) # access private method for error handling testing + pusher._push_data_points( + invalid_data_points + ) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 2 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 1 diff --git a/d2d_development/tests/test_utils.py b/d2d_development/tests/test_utils.py index 37356c3..06fc82c 100644 --- a/d2d_development/tests/test_utils.py +++ b/d2d_development/tests/test_utils.py @@ -106,7 +106,9 @@ def test_write_exception_cleanup(tmp_path: Path, monkeypatch: pytest.MonkeyPatch df = pd.DataFrame({"a": [1]}) file = tmp_path / "fail.parquet" # Patch to_parquet to raise - monkeypatch.setattr(df, "to_parquet", lambda *a, **k: (_ for _ in ()).throw(Exception("fail"))) + monkeypatch.setattr( + df, "to_parquet", lambda *a, **k: (_ for _ in ()).throw(Exception("fail")) + ) with pytest.raises(ExtractorError): save_to_parquet(df, file) # Check no temp files left diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7650ea0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,70 @@ + +[tool.ruff] +line-length = 120 + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 100 + +[tool.ruff.lint] +preview = true +select = [ + "F", # Pyflakes + "E", # pycodestyle + "I", # isort + "D", # pydocstyle + "UP", # pyupgrade + "ANN", # flake8-annotations + "B", # bugbear + "A", # flake8-builtins + "COM", # flake8-commas + "FA", # flake8-future-annotations + "PT", # flake8-pytest-style + "Q", # flake8-quotes + "RET", # flake8-return + "SIM", # flake8-simplify + "PTH", # flake8-use-pathlib + "NPY", # NumPy rules + "PD", # pandas rules + "N", # pep8-naming + "DOC", # pydoclint + "PLC", # pylint convention + "PLE", # pylint error + "PLW", # pylint warning + 
"RUF", # ruff specific rules +] + +ignore = [ + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + "D105", # Missing docstring in magic method + "D106", # Missing docstring in public nested class + "D107", # Missing docstring in __init__ + "D401", # First line should be in imperative mood + "D413", # Missing blank line after last section + "D203", # 1 blank line required before class docstring + "SIM108", # Use ternary operators + "SIM102", # Use a single if statement instead of nested if statements + "SIM114", # Combine `if` branches + "DOC501", # Raised exception {id} missing from docstring + "DOC502", # Raised exception is not explicitly raised: {id} + "RUF022", # `__all__` is not sorted + "RUF005", # Consider expression instead of concatenation + "RUF069", # Unreliable floating point equality comparison + "PD901", # Avoid using the generic variable name df for dataframes + "PLR0904", # Too many public methods ({methods} > {max_methods}) + "PLR0911", # Too many return statements ({returns} > {max_returns}) + "PLR0912", # Too many branches ({branches} > {max_branches}) + "PLR0913", # Too many arguments ({arguments} > {max_arguments}) + "PLR0914", # Too many local variables ({variables} > {max_variables}) + "PLR0915", # Too many statements ({statements} > {max_statements}) + "PLR0916", # Too many Boolean expressions ({expressions} > {max_expressions}) + "PLR1702", # Too many nested blocks ({blocks} > {max_blocks}), + "COM812", # Missing trailing comma +] + +[tool.ruff.lint.flake8-annotations] +allow-star-arg-any = true +mypy-init-return = true +suppress-dummy-args = true +suppress-none-returning = true \ No newline at end of file From 331609c800985fb583a7cb9baa7e02bd59e1a274 Mon Sep 17 00:00:00 2001 From: EMontandon Date: Thu, 26 Mar 2026 18:15:10 +0100 Subject: [PATCH 3/8] fix(): ruff crap again --- d2d_development/tests/mock_dhis2_get.py | 20 +++-- d2d_development/tests/test_data_point.py | 16 +--- 
d2d_development/tests/test_extract.py | 28 ++---- d2d_development/tests/test_push.py | 103 ++++++----------------- d2d_development/tests/test_utils.py | 5 +- 5 files changed, 52 insertions(+), 120 deletions(-) diff --git a/d2d_development/tests/mock_dhis2_get.py b/d2d_development/tests/mock_dhis2_get.py index 48df8ec..b10f767 100644 --- a/d2d_development/tests/mock_dhis2_get.py +++ b/d2d_development/tests/mock_dhis2_get.py @@ -2,8 +2,12 @@ class MockDataValueSets: """Mock class to simulate DHIS2 DataValueSets API responses for testing purposes.""" def get( - self, data_elements=None, periods=None, org_units=None, last_updated=None - ) -> list[dict]: # noqa: ANN001 + self, + data_elements: list[str] = None, # noqa: RUF013 + periods: list[str] = None, # noqa: RUF013 + org_units: list[str] = None, # noqa: RUF013 + last_updated: str = None, # noqa: RUF013 + ) -> list[dict]: """Simulate the retrieval of data values from DHIS2 based on the provided parameters. Returns @@ -139,12 +143,12 @@ class MockAnalytics: def get( self, - indicators=None, - data_elements=None, - periods=None, - org_units=None, - include_cocs=False, - ) -> list[dict]: # noqa: ANN001 + indicators: list[str] = None, # noqa: RUF013 + data_elements: list[str] = None, # noqa: RUF013 + periods: list[str] = None, # noqa: RUF013 + org_units: list[str] = None, # noqa: RUF013 + include_cocs: bool = False, + ) -> list[dict]: """Simulate the retrieval of analytics data from DHIS2 based on the provided parameters. 
Returns diff --git a/d2d_development/tests/test_data_point.py b/d2d_development/tests/test_data_point.py index fe78919..0c28766 100644 --- a/d2d_development/tests/test_data_point.py +++ b/d2d_development/tests/test_data_point.py @@ -1,6 +1,6 @@ import polars as pl - from d2d_development.data_models import DataPointModel + from tests.mock_dhis2_get import MockDHIS2Client @@ -39,13 +39,8 @@ def test_data_point_model_to_json(): assert payload["dataElement"] == data_elements[0]["dataElement"].item() assert payload["period"] == data_elements[0]["period"].item() assert payload["orgUnit"] == data_elements[0]["orgUnit"].item() - assert ( - payload["categoryOptionCombo"] == data_elements[0]["categoryOptionCombo"].item() - ) - assert ( - payload["attributeOptionCombo"] - == data_elements[0]["attributeOptionCombo"].item() - ) + assert payload["categoryOptionCombo"] == data_elements[0]["categoryOptionCombo"].item() + assert payload["attributeOptionCombo"] == data_elements[0]["attributeOptionCombo"].item() assert payload["value"] == data_elements[0]["value"].item() @@ -55,10 +50,7 @@ def test_data_point_model_to_json_delete(): # Set third datapoint to value None to simulate a deleted value data_elements = data_elements.with_columns( - pl.when(pl.arange(0, data_elements.height) == 2) - .then(None) - .otherwise(pl.col("value")) - .alias("value") + pl.when(pl.arange(0, data_elements.height) == 2).then(None).otherwise(pl.col("value")).alias("value") ) points_list = [ DataPointModel( diff --git a/d2d_development/tests/test_extract.py b/d2d_development/tests/test_extract.py index 62b7d70..91521b9 100644 --- a/d2d_development/tests/test_extract.py +++ b/d2d_development/tests/test_extract.py @@ -2,16 +2,16 @@ from unittest.mock import patch import polars as pl - from d2d_development.extract import DHIS2Extractor + from tests.mock_dhis2_get import MockDHIS2Client def test_extract_map_data_elements(): """Test the mapping of data elements.""" - result = DHIS2Extractor( - 
dhis2_client=MockDHIS2Client() - ).data_elements._retrieve_data(data_elements=[], org_units=[], period="202501") + result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).data_elements._retrieve_data( + data_elements=[], org_units=[], period="202501" + ) assert isinstance(result, pl.DataFrame) assert result.shape == (9, 9) assert result.columns == [ @@ -68,9 +68,7 @@ def test_extract_map_data_elements(): def test_extract_map_reporting_rates(): """Test the mapping of reporting rates.""" - result = DHIS2Extractor( - dhis2_client=MockDHIS2Client() - ).reporting_rates._retrieve_data( + result = DHIS2Extractor(dhis2_client=MockDHIS2Client()).reporting_rates._retrieve_data( reporting_rates=[ "AAA111.REPORTING_RATE", "BBB222.EXPECTED_REPORTS", @@ -140,9 +138,7 @@ def test_extract_map_indicator(): def test_extract_download_replace_no_file(tmp_path): # noqa: ANN001 """Test DOWNLOAD_REPLACE mode, downloads and saves data to a Parquet file.""" - extractor = DHIS2Extractor( - dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE" - ) + extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE") filename = "test_extract_202501.parquet" # Call download_period @@ -161,9 +157,7 @@ def test_extract_download_replace_no_file(tmp_path): # noqa: ANN001 def test_download_replace_replaces_file_and_logs(tmp_path): # noqa: ANN001 """Test DOWNLOAD_REPLACE mode, replaces the file if it already exists and logs the replacement.""" - extractor = DHIS2Extractor( - dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE" - ) + extractor = DHIS2Extractor(dhis2_client=MockDHIS2Client(), download_mode="DOWNLOAD_REPLACE") output_dir = tmp_path period = "202501" filename = "test_extract.parquet" @@ -193,10 +187,7 @@ def test_download_replace_replaces_file_and_logs(tmp_path): # noqa: ANN001 ) mtime_after = file_path.stat().st_mtime # Check that the log message about replacing the extract was called - found = any( - "Replacing extract for period 202501" 
in str(call.args[0]) - for call in mock_log.call_args_list - ) + found = any("Replacing extract for period 202501" in str(call.args[0]) for call in mock_log.call_args_list) assert found, "Expected log message about replacing extract not found" # Check that the file was actually replaced (mtime changed) assert mtime_after > mtime_before, "File was not actually replaced" @@ -233,8 +224,7 @@ def test_extract_download_new_file_exists(tmp_path): # noqa: ANN001 ) assert result_path == result_new_path found = any( - "Extract for period 202501 already exists, download skipped." - in str(call.args[0]) + "Extract for period 202501 already exists, download skipped." in str(call.args[0]) for call in mock_log.call_args_list ) assert found, "Expected log message about skipping extract not found" diff --git a/d2d_development/tests/test_push.py b/d2d_development/tests/test_push.py index 226fe11..536ca5f 100644 --- a/d2d_development/tests/test_push.py +++ b/d2d_development/tests/test_push.py @@ -2,9 +2,9 @@ import polars as pl import pytest - from d2d_development.extract import DHIS2Extractor from d2d_development.push import DHIS2Pusher, PusherError + from tests.mock_dhis2_get import MockDHIS2Client from tests.mock_dhis2_post import ( MOCK_DHIS2_ERROR_409_RESPONSE_AOC, @@ -42,26 +42,18 @@ def test_push_missing_mandatory_columns(): pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) cols = ["period", "orgUnit", "categoryOptionCombo", "attributeOptionCombo", "value"] empty_df = pl.DataFrame({col: [] for col in cols}) - with pytest.raises( - PusherError, match=r"Input data is missing mandatory columns: dx" - ): + with pytest.raises(PusherError, match=r"Input data is missing mandatory columns: dx"): pusher.push_data(df_data=empty_df) def test_push_wrong_input_type(): """Test the push of data points to DHIS2.""" pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) - with pytest.raises( - PusherError, match=r"Input data must be a pandas or polars DataFrame." 
- ): + with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): pusher.push_data(df_data=[]) - with pytest.raises( - PusherError, match=r"Input data must be a pandas or polars DataFrame." - ): + with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): pusher.push_data(df_data="not a dataframe") - with pytest.raises( - PusherError, match=r"Input data must be a pandas or polars DataFrame." - ): + with pytest.raises(PusherError, match=r"Input data must be a pandas or polars DataFrame."): pusher.push_data(df_data={}) @@ -69,9 +61,7 @@ def test_push_serialize_data_point_valid(): """Test the serialization of a DataPointModel to JSON format for DHIS2.""" data_point = ( DHIS2Extractor(dhis2_client=MockDHIS2Client()) - .data_elements._retrieve_data( - data_elements=["AAA111"], org_units=[], period="202501" - ) + .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") .slice(0, 1) ) @@ -90,9 +80,7 @@ def test_push_serialize_data_point_to_delete(): """Test the serialization of a DataPointModel to delete JSON format for DHIS2.""" data_point = ( DHIS2Extractor(dhis2_client=MockDHIS2Client()) - .data_elements._retrieve_data( - data_elements=["AAA111"], org_units=[], period="202501" - ) + .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") .slice(3, 1) ) @@ -110,9 +98,7 @@ def test_push_serialize_data_point_to_delete(): def test_push_classify_points(): """Test the mapping of data elements.""" - data_points = DHIS2Extractor( - dhis2_client=MockDHIS2Client() - ).data_elements._retrieve_data( + data_points = DHIS2Extractor(dhis2_client=MockDHIS2Client()).data_elements._retrieve_data( data_elements=["AAA111", "BBB222", "CCC333"], org_units=[], period="202501" ) assert isinstance(data_points, pl.DataFrame) @@ -140,20 +126,14 @@ def test_push_log_invalid_data_points(): with patch.object(pusher, "_log_message") as mock_log_message: 
pusher._log_ignored_or_na(not_valid) - assert mock_log_message.call_count == 5, ( - "Expected a log message for each invalid data point." - ) + assert mock_log_message.call_count == 5, "Expected a log message for each invalid data point." for idx, call in enumerate(mock_log_message.call_args_list): if idx == 0: log_message = call.args[0] - assert "4 data points will be ignored" in log_message, ( - f"Unexpected log message: {log_message}" - ) + assert "4 data points will be ignored" in log_message, f"Unexpected log message: {log_message}" else: log_message = call.args[0] - assert f"Data point ignored: dx=INVALID{idx}" in log_message, ( - f"Unexpected log message: {log_message}" - ) + assert f"Data point ignored: dx=INVALID{idx}" in log_message, f"Unexpected log message: {log_message}" def test_push_data_point(): @@ -161,9 +141,7 @@ def test_push_data_point(): # 1 valid datapoint data_points = ( DHIS2Extractor(dhis2_client=MockDHIS2Client()) - .data_elements._retrieve_data( - data_elements=["AAA111"], org_units=[], period="202501" - ) + .data_elements._retrieve_data(data_elements=["AAA111"], org_units=[], period="202501") .slice(0, 1) ) @@ -195,16 +173,11 @@ def test_push_data_points_connection_error(): "post", return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_503_RESPONSE, status_code=503), ): - with pytest.raises( - PusherError, match=r"Server error: Service temporarily unavailable" - ): + with pytest.raises(PusherError, match=r"Server error: Service temporarily unavailable"): pusher._push_data_points([{"dummy_datapoint": "1"}]) # After the exception, check the summary assert len(pusher.summary["ERRORS"]) == 1 - assert ( - pusher.summary["ERRORS"][0]["message"] - == "Server error: Service temporarily unavailable" - ) + assert pusher.summary["ERRORS"][0]["message"] == "Server error: Service temporarily unavailable" assert pusher.summary["ERRORS"][0]["server_error_code"] == "503" @@ -245,13 +218,9 @@ def test_push_data_points_data_element_error(): with patch.object( 
pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response( - MOCK_DHIS2_ERROR_409_RESPONSE_DE, status_code=409 - ), + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_DE, status_code=409), ): - pusher._push_data_points( - invalid_data_points - ) # access private method for error handling testing + pusher._push_data_points(invalid_data_points) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -298,13 +267,9 @@ def test_push_data_points_org_unit_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response( - MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS, status_code=409 - ), + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_ORG_UNITS, status_code=409), ): - pusher._push_data_points( - invalid_data_points - ) # access private method for error handling testing + pusher._push_data_points(invalid_data_points) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -351,13 +316,9 @@ def test_push_data_points_period_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response( - MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD, status_code=409 - ), + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_PERIOD, status_code=409), ): - pusher._push_data_points( - invalid_data_points - ) # access private method for error handling testing + pusher._push_data_points(invalid_data_points) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -404,13 +365,9 @@ def 
test_push_data_points_coc_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response( - MOCK_DHIS2_ERROR_409_RESPONSE_COC, status_code=409 - ), + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_COC, status_code=409), ): - pusher._push_data_points( - invalid_data_points - ) # access private method for error handling testing + pusher._push_data_points(invalid_data_points) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -457,13 +414,9 @@ def test_push_data_points_aoc_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response( - MOCK_DHIS2_ERROR_409_RESPONSE_AOC, status_code=409 - ), + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_AOC, status_code=409), ): - pusher._push_data_points( - invalid_data_points - ) # access private method for error handling testing + pusher._push_data_points(invalid_data_points) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 1 assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 @@ -510,13 +463,9 @@ def test_push_data_points_value_format_error(): with patch.object( pusher.dhis2_client.api.session, "post", - return_value=MockDHIS2Response( - MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT, status_code=409 - ), + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_VALUE_FORMAT, status_code=409), ): - pusher._push_data_points( - invalid_data_points - ) # access private method for error handling testing + pusher._push_data_points(invalid_data_points) # access private method for error handling testing assert pusher.summary["import_counts"]["imported"] == 2 assert pusher.summary["import_counts"]["updated"] == 0 assert 
pusher.summary["import_counts"]["ignored"] == 1 diff --git a/d2d_development/tests/test_utils.py b/d2d_development/tests/test_utils.py index 06fc82c..4a7cc35 100644 --- a/d2d_development/tests/test_utils.py +++ b/d2d_development/tests/test_utils.py @@ -5,7 +5,6 @@ import pandas as pd import polars as pl import pytest - from d2d_development.exceptions import ExtractorError from d2d_development.utils import log_message, save_to_parquet @@ -106,9 +105,7 @@ def test_write_exception_cleanup(tmp_path: Path, monkeypatch: pytest.MonkeyPatch df = pd.DataFrame({"a": [1]}) file = tmp_path / "fail.parquet" # Patch to_parquet to raise - monkeypatch.setattr( - df, "to_parquet", lambda *a, **k: (_ for _ in ()).throw(Exception("fail")) - ) + monkeypatch.setattr(df, "to_parquet", lambda *a, **k: (_ for _ in ()).throw(Exception("fail"))) with pytest.raises(ExtractorError): save_to_parquet(df, file) # Check no temp files left From 657fa538f3f8b56a0266754da4daf1d0c58cadd5 Mon Sep 17 00:00:00 2001 From: EMontandon Date: Fri, 27 Mar 2026 09:47:14 +0100 Subject: [PATCH 4/8] fix(test): utils tests + move ruff rules to d2d_dev only --- d2d_development/d2d_development/utils.py | 4 +- d2d_development/pyproject.toml | 68 +++++++++++++++++++++++ pyproject.toml | 70 ------------------------ 3 files changed, 71 insertions(+), 71 deletions(-) delete mode 100644 pyproject.toml diff --git a/d2d_development/d2d_development/utils.py b/d2d_development/d2d_development/utils.py index fceb493..55554fc 100644 --- a/d2d_development/d2d_development/utils.py +++ b/d2d_development/d2d_development/utils.py @@ -6,6 +6,8 @@ import polars as pl from openhexa.sdk import current_run +from d2d_development.exceptions import ExtractorError + def log_message( logger: logging.Logger, @@ -86,4 +88,4 @@ def save_to_parquet(data: pl.DataFrame | pd.DataFrame, filename: Path) -> None: # Clean up the temp file if it exists if temp_filename is not None and temp_filename.exists(): temp_filename.unlink() - raise 
Exception(f"Failed to save parquet file to {filename}") from e + raise ExtractorError(f"Failed to save parquet file to {filename}") from e diff --git a/d2d_development/pyproject.toml b/d2d_development/pyproject.toml index 99e6eb8..44cf072 100644 --- a/d2d_development/pyproject.toml +++ b/d2d_development/pyproject.toml @@ -29,3 +29,71 @@ testpaths = ["tests"] where = ["."] exclude = ["tests*"] +[tool.ruff] +line-length = 120 + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 100 + +[tool.ruff.lint] +preview = true +select = [ + "F", # Pyflakes + "E", # pycodestyle + "I", # isort + "D", # pydocstyle + "UP", # pyupgrade + "ANN", # flake8-annotations + "B", # bugbear + "A", # flake8-builtins + "COM", # flake8-commas + "FA", # flake8-future-annotations + "PT", # flake8-pytest-style + "Q", # flake8-quotes + "RET", # flake8-return + "SIM", # flake8-simplify + "PTH", # flake8-use-pathlib + "NPY", # NumPy rules + "PD", # pandas rules + "N", # pep8-naming + "DOC", # pydoclint + "PLC", # pylint convention + "PLE", # pylint error + "PLW", # pylint warning + "RUF", # ruff specific rules +] + +ignore = [ + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + "D105", # Missing docstring in magic method + "D106", # Missing docstring in public nested class + "D107", # Missing docstring in __init__ + "D401", # First line should be in imperative mood + "D413", # Missing blank line after last section + "D203", # 1 blank line required before class docstring + "SIM108", # Use ternary operators + "SIM102", # Use a single if statement instead of nested if statements + "SIM114", # Combine `if` branches + "DOC501", # Raised exception {id} missing from docstring + "DOC502", # Raised exception is not explicitly raised: {id} + "RUF022", # `__all__` is not sorted + "RUF005", # Consider expression instead of concatenation + "RUF069", # Unreliable floating point equality comparison + "PLR0904", # Too many public methods 
({methods} > {max_methods}) + "PLR0911", # Too many return statements ({returns} > {max_returns}) + "PLR0912", # Too many branches ({branches} > {max_branches}) + "PLR0913", # Too many arguments ({arguments} > {max_arguments}) + "PLR0914", # Too many local variables ({variables} > {max_variables}) + "PLR0915", # Too many statements ({statements} > {max_statements}) + "PLR0916", # Too many Boolean expressions ({expressions} > {max_expressions}) + "PLR1702", # Too many nested blocks ({blocks} > {max_blocks}), + "COM812", # Missing trailing comma +] + +[tool.ruff.lint.flake8-annotations] +allow-star-arg-any = true +mypy-init-return = true +suppress-dummy-args = true +suppress-none-returning = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 7650ea0..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,70 +0,0 @@ - -[tool.ruff] -line-length = 120 - -[tool.ruff.format] -docstring-code-format = true -docstring-code-line-length = 100 - -[tool.ruff.lint] -preview = true -select = [ - "F", # Pyflakes - "E", # pycodestyle - "I", # isort - "D", # pydocstyle - "UP", # pyupgrade - "ANN", # flake8-annotations - "B", # bugbear - "A", # flake8-builtins - "COM", # flake8-commas - "FA", # flake8-future-annotations - "PT", # flake8-pytest-style - "Q", # flake8-quotes - "RET", # flake8-return - "SIM", # flake8-simplify - "PTH", # flake8-use-pathlib - "NPY", # NumPy rules - "PD", # pandas rules - "N", # pep8-naming - "DOC", # pydoclint - "PLC", # pylint convention - "PLE", # pylint error - "PLW", # pylint warning - "RUF", # ruff specific rules -] - -ignore = [ - "D100", # Missing docstring in public module - "D104", # Missing docstring in public package - "D105", # Missing docstring in magic method - "D106", # Missing docstring in public nested class - "D107", # Missing docstring in __init__ - "D401", # First line should be in imperative mood - "D413", # Missing blank line after last section - "D203", # 1 blank line required 
before class docstring - "SIM108", # Use ternary operators - "SIM102", # Use a single if statement instead of nested if statements - "SIM114", # Combine `if` branches - "DOC501", # Raised exception {id} missing from docstring - "DOC502", # Raised exception is not explicitly raised: {id} - "RUF022", # `__all__` is not sorted - "RUF005", # Consider expression instead of concatenation - "RUF069", # Unreliable floating point equality comparison - "PD901", # Avoid using the generic variable name df for dataframes - "PLR0904", # Too many public methods ({methods} > {max_methods}) - "PLR0911", # Too many return statements ({returns} > {max_returns}) - "PLR0912", # Too many branches ({branches} > {max_branches}) - "PLR0913", # Too many arguments ({arguments} > {max_arguments}) - "PLR0914", # Too many local variables ({variables} > {max_variables}) - "PLR0915", # Too many statements ({statements} > {max_statements}) - "PLR0916", # Too many Boolean expressions ({expressions} > {max_expressions}) - "PLR1702", # Too many nested blocks ({blocks} > {max_blocks}), - "COM812", # Missing trailing comma -] - -[tool.ruff.lint.flake8-annotations] -allow-star-arg-any = true -mypy-init-return = true -suppress-dummy-args = true -suppress-none-returning = true \ No newline at end of file From 55b88cc52d8254d6c6e38967761fe39ee3c80059 Mon Sep 17 00:00:00 2001 From: EMontandon Date: Fri, 27 Mar 2026 09:49:23 +0100 Subject: [PATCH 5/8] fix(): ruff imports --- d2d_development/tests/test_data_point.py | 2 +- d2d_development/tests/test_extract.py | 2 +- d2d_development/tests/test_push.py | 2 +- d2d_development/tests/test_utils.py | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/d2d_development/tests/test_data_point.py b/d2d_development/tests/test_data_point.py index 0c28766..824ba02 100644 --- a/d2d_development/tests/test_data_point.py +++ b/d2d_development/tests/test_data_point.py @@ -1,6 +1,6 @@ import polars as pl -from d2d_development.data_models import DataPointModel 
+from d2d_development.data_models import DataPointModel from tests.mock_dhis2_get import MockDHIS2Client diff --git a/d2d_development/tests/test_extract.py b/d2d_development/tests/test_extract.py index 91521b9..edbbe9d 100644 --- a/d2d_development/tests/test_extract.py +++ b/d2d_development/tests/test_extract.py @@ -2,8 +2,8 @@ from unittest.mock import patch import polars as pl -from d2d_development.extract import DHIS2Extractor +from d2d_development.extract import DHIS2Extractor from tests.mock_dhis2_get import MockDHIS2Client diff --git a/d2d_development/tests/test_push.py b/d2d_development/tests/test_push.py index 536ca5f..cf1fec3 100644 --- a/d2d_development/tests/test_push.py +++ b/d2d_development/tests/test_push.py @@ -2,9 +2,9 @@ import polars as pl import pytest + from d2d_development.extract import DHIS2Extractor from d2d_development.push import DHIS2Pusher, PusherError - from tests.mock_dhis2_get import MockDHIS2Client from tests.mock_dhis2_post import ( MOCK_DHIS2_ERROR_409_RESPONSE_AOC, diff --git a/d2d_development/tests/test_utils.py b/d2d_development/tests/test_utils.py index 4a7cc35..37356c3 100644 --- a/d2d_development/tests/test_utils.py +++ b/d2d_development/tests/test_utils.py @@ -5,6 +5,7 @@ import pandas as pd import polars as pl import pytest + from d2d_development.exceptions import ExtractorError from d2d_development.utils import log_message, save_to_parquet From 630aae4f615d939b7df7422e5a295ad6eb76c932 Mon Sep 17 00:00:00 2001 From: EMontandon Date: Fri, 27 Mar 2026 12:00:14 +0100 Subject: [PATCH 6/8] feature(DHIS2Push): Summary delete & ignore + test + docs --- d2d_development/d2d_development/push.py | 67 +++++++++++++++++++---- d2d_development/tests/test_push.py | 71 +++++++++++++++++-------- 2 files changed, 107 insertions(+), 31 deletions(-) diff --git a/d2d_development/d2d_development/push.py b/d2d_development/d2d_development/push.py index c6286b2..7e335a1 100644 --- a/d2d_development/d2d_development/push.py +++ 
b/d2d_development/d2d_development/push.py @@ -23,6 +23,7 @@ def __init__( logging_interval: int = 50000, logger: logging.Logger | None = None, ): + """Initialize the DHIS2Pusher.""" self.dhis2_client = dhis2_client if import_strategy not in {"CREATE", "UPDATE", "CREATE_AND_UPDATE"}: @@ -42,7 +43,19 @@ def push_data( self, df_data: pd.DataFrame | pl.DataFrame, ) -> None: - """Push formatted data to DHIS2.""" + """Push formatted data to DHIS2. + + Parameters + ---------- + df_data : pd.DataFrame or pl.DataFrame + DataFrame containing the data points to be pushed. Must include the following columns: + 'dx', 'period', 'orgUnit', 'categoryOptionCombo', 'attributeOptionCombo', and 'value'. + + Raises + ------ + PusherError + If the input data is not a DataFrame or if mandatory fields are missing. + """ self._reset_summary() self._set_summary_import_options() @@ -101,6 +114,7 @@ def _classify_data_points(self, data_points: pl.DataFrame) -> tuple[pl.DataFrame return valid, to_delete, not_valid def _set_summary_import_options(self): + """Set the import options in the summary dictionary based on the current configuration.""" self.summary["import_options"] = { "importStrategy": self.import_strategy, "dryRun": self.dry_run, @@ -109,7 +123,13 @@ def _set_summary_import_options(self): } def _push_valid(self, data_points_valid: pl.DataFrame) -> None: - """Push valid values to DHIS2.""" + """Push valid values to DHIS2. + + Parameters + ---------- + data_points_valid: pl.DataFrame + DataFrame containing valid data points to be pushed to DHIS2. 
+ """ if len(data_points_valid) == 0: self._log_message("No data to push.") return @@ -119,6 +139,7 @@ def _push_valid(self, data_points_valid: pl.DataFrame) -> None: self._log_message(f"Data points push summary: {self.summary['import_counts']}") def _push_to_delete(self, data_points_to_delete: pl.DataFrame) -> None: + """Push data points with NA values to DHIS2 to delete them.""" if data_points_to_delete.height == 0: return @@ -128,11 +149,19 @@ def _push_to_delete(self, data_points_to_delete: pl.DataFrame) -> None: self._log_message(f"Data points delete summary: {self.summary['import_counts']}") def _log_ignored_or_na(self, data_points: pl.DataFrame, is_na: bool = False): - """Logs ignored or NA data points.""" + """Logs ignored or NA data points. + + Parameters + ---------- + data_points: pl.DataFrame + DataFrame containing the data points to be logged as ignored or NA. + is_na: bool + Flag whether the data points are NA (to be deleted) or ignored. Defaults to False (ignored). + """ data_points_list = data_points.to_dicts() if len(data_points_list) > 0: self._log_message( - f"{len(data_points_list)} data points will be {'set to NA' if is_na else 'ignored'}. " + f"{len(data_points_list)} data points will be {'set to NA' if is_na else 'ignored'}. " "Please check the last execution report for details.", level="warning", ) @@ -141,6 +170,9 @@ def _log_ignored_or_na(self, data_points: pl.DataFrame, is_na: bool = False): self._log_message( f"{i}. 
Data point {'NA' if is_na else 'ignored'}: {row_str}", log_current_run=False, level="warning" ) + if is_na: + self.summary["delete_data_points"].append(ignored) + self.summary["ignored_data_points"].append(ignored) def _log_message(self, message: str, level: str = "info", log_current_run: bool = True, error_details: str = ""): """Log a message using the configured logging function.""" @@ -174,7 +206,7 @@ def _serialize_data_points(self, data_points: pl.DataFrame) -> list[dict]: def _log_summary_errors(self): """Logs all the errors in the summary dictionary using the configured logging.""" - errors = self.summary.get("ERRORS", []) + errors = self.summary.get("import_errors", []) if not errors: self._log_message("No errors found in the summary.") else: @@ -204,7 +236,13 @@ def _push_data_points( self, data_point_list: list[dict], ) -> None: - """dry_run: Set to true to get an import summary without actually importing data (DHIS2).""" + """Push data points to DHIS2 in chunks, handling responses and logging progress. + + Parameters + ---------- + data_point_list: list[dict] + A list of dictionaries, each representing a data point formatted for DHIS2. 
+ """ total_data_points = len(data_point_list) processed_points = 0 last_logged_at = 0 @@ -230,7 +268,7 @@ def _push_data_points( self._update_import_counts(response) else: # No response JSON, at least log the request error msg - self.summary["ERRORS"].extend( + self.summary["import_errors"].extend( [{"chunk": chunk_id, "period": chunk[0].get("period", "-"), "exception": str(e)}] ) self._extract_conflicts(response) @@ -265,14 +303,17 @@ def _raise_server_errors(self, r: requests.Response) -> None: "server_error_code": f"{r.status_code}", "message": f"Server error: {message}", } - self.summary["ERRORS"].append(error_info) + self.summary["import_errors"].append(error_info) raise PusherError(f"Server error: {message}") from None def _reset_summary(self) -> None: + """Reset the summary dictionary to its initial state before starting a new push operation.""" self.summary = { "import_counts": {"imported": 0, "updated": 0, "ignored": 0, "deleted": 0}, "import_options": {}, - "ERRORS": [], + "import_errors": [], + "ignored_data_points": [], + "delete_data_points": [], } def _split_list(self, src_list: list, length: int): @@ -285,6 +326,11 @@ def _split_list(self, src_list: list, length: int): yield src_list[i : i + length] def _safe_json(self, r: requests.Response) -> dict | None: + """Safely parse the JSON response from a requests.Response object. + + Returns: + dict: The parsed JSON response if successful, or None if parsing fails or if the response is None. 
+ """ if r is None: return None @@ -294,6 +340,7 @@ def _safe_json(self, r: requests.Response) -> dict | None: return None def _update_import_counts(self, response: dict) -> None: + """Update the import counts in the summary dictionary based on the response from DHIS2.""" if not response: return if "importCount" in response: @@ -327,4 +374,4 @@ def _extract_conflicts(self, response: dict) -> None: all_errors = conflicts + error_reports if all_errors: - self.summary.setdefault("ERRORS", []).extend(all_errors) + self.summary.setdefault("import_errors", []).extend(all_errors) diff --git a/d2d_development/tests/test_push.py b/d2d_development/tests/test_push.py index cf1fec3..5f08ee3 100644 --- a/d2d_development/tests/test_push.py +++ b/d2d_development/tests/test_push.py @@ -130,10 +130,39 @@ def test_push_log_invalid_data_points(): for idx, call in enumerate(mock_log_message.call_args_list): if idx == 0: log_message = call.args[0] - assert "4 data points will be ignored" in log_message, f"Unexpected log message: {log_message}" + assert "4 data points will be ignored" in log_message, f"Unexpected log message: {log_message}" else: log_message = call.args[0] assert f"Data point ignored: dx=INVALID{idx}" in log_message, f"Unexpected log message: {log_message}" + # Extra check for number of ignored data points in summary + assert "ignored_data_points" in pusher.summary, "summary should contain 'ignored_data_points' key" + assert len(pusher.summary["ignored_data_points"]) == 4, "Expected 4 ignored data points in summary" + + +def test_push_log_delete_data_points(): + """Test the logging of invalid data points.""" + data_points = ( + DHIS2Extractor(dhis2_client=MockDHIS2Client()) + .data_elements._retrieve_data(data_elements=[], org_units=[], period="202501") + .slice(3, 1) # Select invalid data points (rows 4 to 7) for testing + ) + print(data_points) + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + _, to_delete, _ = pusher._classify_data_points(data_points) + + with 
patch.object(pusher, "_log_message") as mock_log_message: + pusher._log_ignored_or_na(to_delete, is_na=True) + assert mock_log_message.call_count == 2, "Expected a log message for each invalid data point." + for idx, call in enumerate(mock_log_message.call_args_list): + if idx == 0: + log_message = call.args[0] + assert "1 data points will be set to NA" in log_message, f"Unexpected log message: {log_message}" + else: + log_message = call.args[0] + assert "Data point NA: dx=DELETE1" in log_message, f"Unexpected log message: {log_message}" + # Extra check for number of ignored data points in summary + assert "delete_data_points" in pusher.summary, "summary should contain 'delete_data_points' key" + assert len(pusher.summary["delete_data_points"]) == 1, "Expected 4 ignored data points in summary" def test_push_data_point(): @@ -176,9 +205,9 @@ def test_push_data_points_connection_error(): with pytest.raises(PusherError, match=r"Server error: Service temporarily unavailable"): pusher._push_data_points([{"dummy_datapoint": "1"}]) # After the exception, check the summary - assert len(pusher.summary["ERRORS"]) == 1 - assert pusher.summary["ERRORS"][0]["message"] == "Server error: Service temporarily unavailable" - assert pusher.summary["ERRORS"][0]["server_error_code"] == "503" + assert len(pusher.summary["import_errors"]) == 1 + assert pusher.summary["import_errors"][0]["message"] == "Server error: Service temporarily unavailable" + assert pusher.summary["import_errors"][0]["server_error_code"] == "503" def test_push_data_points_data_element_error(): @@ -225,9 +254,9 @@ def test_push_data_points_data_element_error(): assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 assert pusher.summary["import_counts"]["deleted"] == 0 - assert len(pusher.summary["ERRORS"]) == 2 - assert pusher.summary["ERRORS"][0]["object"] == "INVALID_1" - assert pusher.summary["ERRORS"][1]["object"] == "INVALID_2" + assert 
len(pusher.summary["import_errors"]) == 2 + assert pusher.summary["import_errors"][0]["object"] == "INVALID_1" + assert pusher.summary["import_errors"][1]["object"] == "INVALID_2" def test_push_data_points_org_unit_error(): @@ -274,9 +303,9 @@ def test_push_data_points_org_unit_error(): assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 assert pusher.summary["import_counts"]["deleted"] == 0 - assert len(pusher.summary["ERRORS"]) == 2 - assert pusher.summary["ERRORS"][0]["object"] == "INVALID_1_OU" - assert pusher.summary["ERRORS"][1]["object"] == "INVALID_2_OU" + assert len(pusher.summary["import_errors"]) == 2 + assert pusher.summary["import_errors"][0]["object"] == "INVALID_1_OU" + assert pusher.summary["import_errors"][1]["object"] == "INVALID_2_OU" def test_push_data_points_period_error(): @@ -323,9 +352,9 @@ def test_push_data_points_period_error(): assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 assert pusher.summary["import_counts"]["deleted"] == 0 - assert len(pusher.summary["ERRORS"]) == 2 - assert pusher.summary["ERRORS"][0]["object"] == "INVALID_PERIOD_1" - assert pusher.summary["ERRORS"][1]["object"] == "INVALID_PERIOD_2" + assert len(pusher.summary["import_errors"]) == 2 + assert pusher.summary["import_errors"][0]["object"] == "INVALID_PERIOD_1" + assert pusher.summary["import_errors"][1]["object"] == "INVALID_PERIOD_2" def test_push_data_points_coc_error(): @@ -372,9 +401,9 @@ def test_push_data_points_coc_error(): assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 assert pusher.summary["import_counts"]["deleted"] == 0 - assert len(pusher.summary["ERRORS"]) == 2 - assert pusher.summary["ERRORS"][0]["object"] == "INVALID_COC_1" - assert pusher.summary["ERRORS"][1]["object"] == "INVALID_COC_2" + assert len(pusher.summary["import_errors"]) == 2 + assert 
pusher.summary["import_errors"][0]["object"] == "INVALID_COC_1" + assert pusher.summary["import_errors"][1]["object"] == "INVALID_COC_2" def test_push_data_points_aoc_error(): @@ -421,9 +450,9 @@ def test_push_data_points_aoc_error(): assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 2 assert pusher.summary["import_counts"]["deleted"] == 0 - assert len(pusher.summary["ERRORS"]) == 2 - assert pusher.summary["ERRORS"][0]["object"] == "INVALID_AOC_1" - assert pusher.summary["ERRORS"][1]["object"] == "INVALID_AOC_2" + assert len(pusher.summary["import_errors"]) == 2 + assert pusher.summary["import_errors"][0]["object"] == "INVALID_AOC_1" + assert pusher.summary["import_errors"][1]["object"] == "INVALID_AOC_2" def test_push_data_points_value_format_error(): @@ -470,5 +499,5 @@ def test_push_data_points_value_format_error(): assert pusher.summary["import_counts"]["updated"] == 0 assert pusher.summary["import_counts"]["ignored"] == 1 assert pusher.summary["import_counts"]["deleted"] == 0 - assert len(pusher.summary["ERRORS"]) == 1 - assert pusher.summary["ERRORS"][0]["object"] == "VALID2" + assert len(pusher.summary["import_errors"]) == 1 + assert pusher.summary["import_errors"][0]["object"] == "VALID2" From d0e3b919afe9aa796695fa07ecc0bc6494a2c137 Mon Sep 17 00:00:00 2001 From: EMontandon Date: Fri, 27 Mar 2026 12:00:52 +0100 Subject: [PATCH 7/8] chores(DHIS2Extract): docs + Update readme --- d2d_development/README.md | 125 +++++++++++++++++++-- d2d_development/d2d_development/extract.py | 86 +++++++++++++- 2 files changed, 197 insertions(+), 14 deletions(-) diff --git a/d2d_development/README.md b/d2d_development/README.md index 962394b..73fe283 100644 --- a/d2d_development/README.md +++ b/d2d_development/README.md @@ -17,7 +17,6 @@ pip install git+https://github.com/BLSQ/openhexa-ds-developments.git#subdirector **Description:** Main class to extract data from DHIS2. 
It provides unified handlers for extracting data elements, indicators, and reporting rates, saving them to disk in a standardized format. - **Configuration Parameters:** When initializing `DHIS2Extractor`, you can configure the following parameters: @@ -31,6 +30,49 @@ Example: extractor = DHIS2Extractor(dhis2_client, download_mode="DOWNLOAD_NEW", return_existing_file=True) ``` +**Parameters for `download_period` (DataElementsExtractor, IndicatorsExtractor, ReportingRatesExtractor):** + +- **data_elements / indicators / reporting_rates** (`list[str]`): + A list of DHIS2 UIDs to extract. + - For `data_elements.download_period`, use `data_elements=["de1", "de2"]`. + - For `indicators.download_period`, use `indicators=["ind1", ...]`. + - For `reporting_rates.download_period`, use `reporting_rates=["rr1", ...]`. + +- **org_units** (`list[str]`): + List of DHIS2 organisation unit UIDs to extract data for (e.g., `["ou1", "ou2"]`). + +- **period** (`str`): + The DHIS2 period to extract data for (e.g., `"202401"` for January 2024). Must be a valid DHIS2 period string. + +- **output_dir** (`Path`): + The directory where the extracted data file will be saved. The file will be named `data_<period>.parquet` by default unless you specify a custom filename. + +- **filename** (`str | None`, optional): + Custom filename for the output file. If not provided, the default is `data_<period>.parquet`. Using the default is recommended when extracting multiple periods. + +- **kwargs** (`dict`, optional): + Additional keyword arguments for advanced extraction options. + - For data elements: `last_updated` (not yet implemented). + - For indicators: `include_cocs` (bool, whether to include category option combos; use only together with data element IDs). + - For reporting rates: currently no extra options. + +**Returns:** +- The path to the saved Parquet file (`Path`), or `None` if no data was extracted or the file already exists and `return_existing_file` is `False`.
+ +**Output Format:** +The extraction methods always save the data in a table with a fixed column structure. Each extraction creates a separate .parquet file, where each row represents a data point and the columns are always: + +- **dx**: Data element, indicator, or reporting rate UID +- **period**: Period (e.g., `"202401"`) +- **orgUnit**: Organisation unit UID +- **categoryOptionCombo**: Category option combo UID +- **attributeOptionCombo**: Attribute option combo UID +- **rateMetric**: Rate metric (for reporting rates) +- **domainType**: Data domain (e.g., `"AGGREGATED"`) +- **value**: The value for the data point + +The file path to the saved Parquet file is returned by the extraction method. You can load the output using pandas, polars, or any tool that supports Parquet files. + **Usage Example:** ```python from d2d_development.extract import DHIS2Extractor @@ -49,6 +91,7 @@ for period in ["202401", "202402", "202403"]: period=period, output_dir=Path("/output") ) + # Extract one period of indicators extractor.indicators.download_period( indicators=["ind1"], @@ -56,6 +99,7 @@ extractor.indicators.download_period( period="202401", output_dir=Path("/tmp") ) + # Extract one period of reporting rates extractor.reporting_rates.download_period( reporting_rates=["rr1"], @@ -63,18 +107,44 @@ extractor.reporting_rates.download_period( period="202401", output_dir=Path("/tmp") ) + +# Example: load the output file +import polars as pl +df = pl.read_parquet(Path("/tmp") / f"data_{period}.parquet") # Default naming +print(df.head()) ``` +**Note:** +- The same pattern applies for `extractor.indicators.download_period` and `extractor.reporting_rates.download_period`, just change the first argument name accordingly. +- All extracted files are saved in Parquet format by default. + + +--- + + ### DHIS2Pusher **Description:** Main class to handle pushing data to DHIS2. It validates and pushes formatted data (pandas or polars DataFrame) to a DHIS2 instance.
+**Input Data Format for `DHIS2Pusher`** + +The `push_data` method expects a pandas or polars DataFrame with the following columns (all required): + +- **dx**: Data element, indicator, or reporting rate UID +- **period**: Period (e.g., `"202401"`) +- **orgUnit**: Organisation unit UID +- **categoryOptionCombo**: Category option combo UID +- **attributeOptionCombo**: Attribute option combo UID +- **value**: The value to be pushed (numeric or string, depending on DHIS2 configuration) + +If any of these columns are missing, or if the input is not a pandas or polars DataFrame, a `PusherError` will be raised. + **Configuration Parameters:** When initializing `DHIS2Pusher`, you can configure the following parameters: - `dhis2_client` (required): The DHIS2 client instance. -- `import_strategy`: Strategy flag passed to the DHIS2 API for data import. Accepts "CREATE", "UPDATE", or "CREATE_AND_UPDATE" (default: "CREATE_AND_UPDATE"). This only controls how the DHIS2 server processes the data; it does not affect client-side logic. +- `import_strategy`: Strategy flag passed to the DHIS2 API for data import. Accepts "CREATE", "UPDATE", or "CREATE_AND_UPDATE" (default: "CREATE_AND_UPDATE"). **NOTE:** This only controls how the DHIS2 server processes the data; it does not affect client-side logic. - `dry_run`: If `True`, simulates the push without making changes on the server (default: `True`). - `max_post`: Maximum number of data points per POST request (default: `500`). - `logging_interval`: Log progress every N data points (default: `50000`). 
@@ -88,13 +158,6 @@ from openhexa.toolbox.dhis2 import DHIS2 import polars as pl dhis2_client = DHIS2(workspace.get_connection("dhis2-connection")) -pusher = DHIS2Pusher( - dhis2_client, - import_strategy="CREATE_AND_UPDATE", # or "CREATE", "UPDATE" - dry_run=False, - max_post=1000, - logging_interval=10000, -) df = pl.DataFrame({ "dx": ["de1"], @@ -103,5 +166,49 @@ df = pl.DataFrame({ "categoryOptionCombo": ["coc"], "attributeOptionCombo": ["aoc"], "value": [123]}) + +pusher = DHIS2Pusher( + dhis2_client, + import_strategy="CREATE_AND_UPDATE", # or "CREATE", "UPDATE" + dry_run=False, + max_post=1000, + logging_interval=10000, +) + pusher.push_data(df) ``` + +**Accessing Push Summary Information** + +After calling `push_data`, the `DHIS2Pusher` instance provides detailed results of the push operation in its `summary` attribute. This dictionary contains: + +- `import_counts`: Number of data points imported, updated, ignored, or deleted (dict with keys: `imported`, `updated`, `ignored`, `deleted`). +- `import_options`: The options used for the import (strategy, dry run, etc). +- `import_errors`: List of errors, conflicts, or error reports returned by DHIS2 or encountered during the push. +- `ignored_data_points`: List of data points that were ignored due to missing or invalid fields. +- `delete_data_points`: List of data points that were marked for deletion (value is NA/null). + +**Example:** +```python +pusher.push_data(df) +print(pusher.summary) +# Example output: +# { +# 'import_counts': {'imported': 1, 'updated': 0, 'ignored': 0, 'deleted': 0}, +# 'import_options': {'importStrategy': 'CREATE_AND_UPDATE', 'dryRun': False, ...}, +# 'import_errors': [], +# 'ignored_data_points': [], +# 'delete_data_points': [] +# } +``` + +You can use these fields to programmatically inspect the results of your push, handle errors, or log/report the outcome. 
For example, to check if any data points were ignored: + +```python +if pusher.summary["ignored_data_points"]: + print(f"Ignored {len(pusher.summary['ignored_data_points'])} data points:") + for dp in pusher.summary["ignored_data_points"]: + print(dp) +``` + +--- \ No newline at end of file diff --git a/d2d_development/d2d_development/extract.py b/d2d_development/d2d_development/extract.py index 2937377..407446c 100644 --- a/d2d_development/d2d_development/extract.py +++ b/d2d_development/d2d_development/extract.py @@ -8,11 +8,6 @@ from .exceptions import ExtractorError from .utils import log_message, save_to_parquet -# TODO: -# 1) Refactor the extractors to (Following DHIS2 client endpoints): -# -DataValueSetsExtractor (DE) -# -AnalyticsExtractor (DE, indicators, ReportingRates) - class DataElementsExtractor: """Handles downloading and formatting of data elements from DHIS2.""" @@ -74,6 +69,23 @@ def download_period( raise ExtractorError(f"Extract data elements download error : {e}") from e def _retrieve_data(self, data_elements: list[str], org_units: list[str], period: str, **kwargs) -> pl.DataFrame: # noqa: ANN003 + """Retrieve data from DHIS2 for the specified data elements, organization units, and period. + + Parameters + ---------- + data_elements : list[str] + List of DHIS2 data element UIDs to extract. + org_units : list[str] + List of DHIS2 organization unit UIDs to extract data for. + period : str + DHIS2 period (valid format) to extract data for. + kwargs : dict + Additional keyword arguments for data retrieval, only `last_updated` available but not implemented yet. + + Returns + ------- + pl.DataFrame A DataFrame containing the retrieved data, formatted according to DHIS2 naming standards.
+ """ if not self.extractor._valid_dhis2_period_format(period): raise ExtractorError(f"Invalid DHIS2 period format: {period}") last_updated = kwargs.get("last_updated") @@ -96,6 +108,7 @@ class IndicatorsExtractor: """Handles downloading and formatting of indicators from DHIS2.""" def __init__(self, extractor: "DHIS2Extractor"): + """Initialize the IndicatorsExtractor with a reference to the main DHIS2Extractor.""" self.extractor = extractor def download_period( @@ -155,6 +168,24 @@ def download_period( raise ExtractorError(f"Extract indicators download error : {e}") from e def _retrieve_data(self, indicators: list[str], org_units: list[str], period: str, **kwargs) -> pl.DataFrame: # noqa: ANN003 + """Retrieve data from DHIS2 for the specified indicators, organization units, and period. + + Parameters + ---------- + indicators : list[str] + List of DHIS2 indicator UIDs to extract. + org_units : list[str] + List of DHIS2 organization unit UIDs to extract data for. + period : str + DHIS2 period (valid format) to extract data for. + kwargs : dict + Additional keyword arguments for data retrieval, only `include_cocs` currently implemented + to include category option combo mapping for data element ids passed to the DHIS2 client. + + Returns + ------- + pl.DataFrame A DataFrame containing the retrieved data, formatted according to DHIS2 naming standards. 
+ """ if not self.extractor._valid_dhis2_period_format(period): raise ExtractorError(f"Invalid DHIS2 period format: {period}") @@ -184,6 +215,7 @@ class ReportingRatesExtractor: """Handles downloading and formatting of reporting rates from DHIS2.""" def __init__(self, extractor: "DHIS2Extractor"): + """Initialize the ReportingRatesExtractor with a reference to the main DHIS2Extractor.""" self.extractor = extractor def download_period( @@ -240,6 +272,23 @@ def download_period( raise ExtractorError(f"Extract reporting rates download error : {e}") from e def _retrieve_data(self, reporting_rates: list[str], org_units: list[str], period: str, **kwargs) -> pl.DataFrame: # noqa: ANN003 + """Retrieve data from DHIS2 for the specified reporting rates, organization units, and period. + + Parameters + ---------- + reporting_rates : list[str] + List of DHIS2 reporting rate UIDs to extract. + org_units : list[str] + List of DHIS2 organization unit UIDs to extract data for. + period : str + DHIS2 period (valid format) to extract data for. + kwargs : dict + Additional keyword arguments for data retrieval (not implemented). + + Returns + ------- + pl.DataFrame A DataFrame containing the retrieved data, formatted according to DHIS2 naming standards. + """ if not self.extractor._valid_dhis2_period_format(period): raise ExtractorError(f"Invalid DHIS2 period format: {period}") @@ -295,6 +344,7 @@ def __init__( return_existing_file: bool = False, logger: logging.Logger | None = None, ): + """Initialize the DHIS2Extractor with the given DHIS2 client and configuration.""" self.dhis2_client = dhis2_client if download_mode not in {"DOWNLOAD_REPLACE", "DOWNLOAD_NEW"}: raise ExtractorError("Invalid 'download_mode', use 'DOWNLOAD_REPLACE' or 'DOWNLOAD_NEW'.") @@ -317,6 +367,32 @@ def _handle_extract_for_period( filename: str | None = None, **kwargs, # noqa: ANN003 ) -> Path | None: + """Handles the extract process for a given period, including data retrieval, file saving, and logging.
+ + Parameters + ---------- + handler : DataElementsExtractor | IndicatorsExtractor | ReportingRatesExtractor + The specific handler to use for data retrieval. + data_products : list[str] + List of data product UIDs to extract (e.g., data elements, indicators, or reporting rates). + org_units : list[str] + List of DHIS2 organization unit UIDs to extract data for. + period : str + DHIS2 period (valid format) to extract data for. + output_dir : Path + Directory where extracted data files will be saved. + filename : str | None + Optional filename for the extracted data file. If None, a default name will be used. + kwargs : dict + Additional keyword arguments for data retrieval, such as `last_updated` for filtering data. + + Returns + ------- + Path | None + The path to the extracted data file, or None if no data was extracted or if + the file already exists and `return_existing_file` is False. + + """ output_dir.mkdir(parents=True, exist_ok=True) if filename: extract_fname = output_dir / filename From 460d9782e0abf0166b47cdbda3b5ec0e26c8199e Mon Sep 17 00:00:00 2001 From: EMontandon Date: Mon, 30 Mar 2026 10:37:43 +0200 Subject: [PATCH 8/8] feat(push): rejected datapoints + test --- d2d_development/d2d_development/push.py | 29 +++++++++++---- d2d_development/d2d_development/utils.py | 2 +- d2d_development/tests/test_push.py | 46 +++++++++++++++++++++++- 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/d2d_development/d2d_development/push.py b/d2d_development/d2d_development/push.py index 7e335a1..ff496da 100644 --- a/d2d_development/d2d_development/push.py +++ b/d2d_development/d2d_development/push.py @@ -259,7 +259,7 @@ def _push_data_points( self._update_import_counts(response) # Capture conflicts/errorReports if present - self._extract_conflicts(response) + self._extract_conflicts(response, chunk) except requests.exceptions.RequestException as e: self._raise_server_errors(r) # Stop the process if there's a server error @@ -271,7 +271,7 @@ def 
_push_data_points( self.summary["import_errors"].extend( [{"chunk": chunk_id, "period": chunk[0].get("period", "-"), "exception": str(e)}] ) - self._extract_conflicts(response) + self._extract_conflicts(response, chunk) processed_points += len(chunk) @@ -312,6 +312,7 @@ def _reset_summary(self) -> None: "import_counts": {"imported": 0, "updated": 0, "ignored": 0, "deleted": 0}, "import_options": {}, "import_errors": [], + "rejected_datapoints": [], "ignored_data_points": [], "delete_data_points": [], } @@ -352,26 +353,40 @@ def _update_import_counts(self, response: dict) -> None: for key in ["imported", "updated", "ignored", "deleted"]: self.summary["import_counts"][key] += import_counts.get(key, 0) - def _extract_conflicts(self, response: dict) -> None: + def _extract_conflicts(self, response: dict, chunk: list) -> None: """Extract all conflicts and errorReports from a DHIS2 API response. - Handles both top-level and nested 'response' nodes. Optionally updates the summary. + This method looks for 'conflicts' and 'errorReports' at both the top level and within a nested + 'response' object. It also extracts 'rejectedIndexes' from the nested 'response' to identify which data + points were rejected by DHIS2, and adds them to the summary under 'rejected_datapoints'. Parameters ---------- - response : dict - The JSON response from DHIS2 after an import. + response: dict + The JSON response from the DHIS2 API after attempting to push data points. + chunk: list + The list of data points that were included in the API request corresponding to the response. 
""" if not response: return + conflicts = response.get("conflicts", []) error_reports = response.get("errorReports", []) - # Check if nested under "response" nested = response.get("response", {}) conflicts += nested.get("conflicts", []) error_reports += nested.get("errorReports", []) + rejected_indexes = nested.get("rejectedIndexes", []) + all_errors = conflicts + error_reports if all_errors: self.summary.setdefault("import_errors", []).extend(all_errors) + + # Extract rejected datapoints + if rejected_indexes: + rejected_datapoints = [ + {"index": idx, "datapoint": chunk[idx]} for idx in rejected_indexes if 0 <= idx < len(chunk) + ] + if rejected_datapoints: + self.summary.setdefault("rejected_datapoints", []).extend(rejected_datapoints) diff --git a/d2d_development/d2d_development/utils.py b/d2d_development/d2d_development/utils.py index 55554fc..94e3f53 100644 --- a/d2d_development/d2d_development/utils.py +++ b/d2d_development/d2d_development/utils.py @@ -6,7 +6,7 @@ import polars as pl from openhexa.sdk import current_run -from d2d_development.exceptions import ExtractorError +from .exceptions import ExtractorError def log_message( diff --git a/d2d_development/tests/test_push.py b/d2d_development/tests/test_push.py index 5f08ee3..840e2ef 100644 --- a/d2d_development/tests/test_push.py +++ b/d2d_development/tests/test_push.py @@ -146,7 +146,6 @@ def test_push_log_delete_data_points(): .data_elements._retrieve_data(data_elements=[], org_units=[], period="202501") .slice(3, 1) # Select invalid data points (rows 4 to 7) for testing ) - print(data_points) pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) _, to_delete, _ = pusher._classify_data_points(data_points) @@ -501,3 +500,48 @@ def test_push_data_points_value_format_error(): assert pusher.summary["import_counts"]["deleted"] == 0 assert len(pusher.summary["import_errors"]) == 1 assert pusher.summary["import_errors"][0]["object"] == "VALID2" + + +def test_push_summary_rejected_points(): + """Test that 
rejected data points are correctly tracked in the summary.""" + pusher = DHIS2Pusher(dhis2_client=MockDHIS2Client()) + # NOTE: This fake input is just to pass validation and + # match the information manufactured in the response + invalid_dp_1 = { + "dataElement": "VALID2", + "period": "202501", + "orgUnit": "ORG002", + "categoryOptionCombo": "INVALID_AOC_1", + "attributeOptionCombo": "ATTR001", + "value": "1", + } + invalid_dp_2 = { + "dataElement": "VALID3", + "period": "202501", + "orgUnit": "ORG003", + "categoryOptionCombo": "INVALID_AOC_2", + "attributeOptionCombo": "ATTR001", + "value": "1", + } + invalid_data_points = [ + { + "dataElement": "VALID1", + "period": "202501", + "orgUnit": "ORG001", + "categoryOptionCombo": "CAT001", + "attributeOptionCombo": "ATTR001", + "value": "1", + }, + invalid_dp_1, + invalid_dp_2, + ] + + # MOCK_DHIS2_ERROR_409_RESPONSE_AOC was manually manufactured to simulate a Conflict from DHIS2. + with patch.object( + pusher.dhis2_client.api.session, + "post", + return_value=MockDHIS2Response(MOCK_DHIS2_ERROR_409_RESPONSE_AOC, status_code=409), + ): + pusher._push_data_points(invalid_data_points) # access private method for error handling testing + assert pusher.summary["rejected_datapoints"][0]["datapoint"] == invalid_dp_1 + assert pusher.summary["rejected_datapoints"][1]["datapoint"] == invalid_dp_2