1- """Tests for parser error handling, schema compliance, and so on."""
1+ """
2+
3+ Tests for DataFrameValidator behavior:
4+ - empty dataframe handling
5+ - mocked validation flow
6+ - integration validation on real RefSeq CDM outputs
7+
8+ """
29
310from typing import Any
411from unittest .mock import MagicMock
512
613import pytest
714from pyspark .sql import DataFrame , SparkSession
815from pyspark .sql .types import StructField , StructType
16+ from pyspark .sql .functions import col , when , lit
917
1018from cdm_data_loader_utils .audit .schema import METRICS , REJECTS , ROW_ERRORS
1119from cdm_data_loader_utils .core .constants import INVALID_DATA_FIELD_NAME
1422from tests .audit .conftest import create_table
1523
1624
25+ # ------------------------------------------------------------------------------
26+ # Unit tests
27+ # ------------------------------------------------------------------------------
1728@pytest .mark .requires_spark
1829def test_validate_dataframe_empty_df (pipeline_run : PipelineRun , empty_df : DataFrame ) -> None :
1930 """Assert that an empty dataframe does not perform any validation."""
@@ -82,34 +93,36 @@ def test_validate_dataframe_no_validation( # noqa: PLR0913
8293 assert rejects .count () == output .records_invalid
8394
8495
96+ # ------------------------------------------------------------------------------
97+ # Integration-style test (real RefSeq CDM output)
98+ # ------------------------------------------------------------------------------
99+
100+
85101@pytest .mark .requires_spark
86- def test_validate_refseq_cdm_identifiers (
102+ def test_validate_refseq_cdm (
87103 spark : SparkSession ,
88104 pipeline_run : PipelineRun ,
89105) -> None :
90- """
91- Validate CDM identifiers produced by annotation_parse.py
92- """
93-
94- # Prepare the metrics / rejects form scratch
95- for t in [METRICS , REJECTS ]:
106+ # Prepare audit tables from scratch
107+ for t in (METRICS , REJECTS ):
96108 create_table (spark , t , add_default_data = False )
97109
98- # read annotation_parse.py output
110+ # Load real pipeline output
99111 df = spark .table (f"{ pipeline_run .namespace } .cdm_identifiers" )
100112
101- # sanity check
113+ # Sanity check: pipeline actually produced data
102114 assert df .count () > 0
115+ assert "identifier" in df .columns
103116
104- # identifier cannot be null
117+ # Simple validation rule: identifier cannot be null
105118 def validation_fn (df : DataFrame ) -> DataFrame :
106- from pyspark .sql .functions import when , col
107-
108- return df .withColumn (INVALID_DATA_FIELD_NAME , when (col ("identifier" ).isNull (), "identifier is null" ))
119+ return df .withColumn (
120+ INVALID_DATA_FIELD_NAME ,
121+ when (col ("identifier" ).isNull (), lit ("identifier is null" )),
122+ )
109123
110124 validator = Validator (validation_fn , {})
111125
112- # DataFrameValidator
113126 dfv = DataFrameValidator (spark )
114127 output = dfv .validate_dataframe (
115128 data_to_validate = df ,
@@ -119,11 +132,14 @@ def validation_fn(df: DataFrame) -> DataFrame:
119132 invalid_col = INVALID_DATA_FIELD_NAME ,
120133 )
121134
122- # assertions
135+ # Records accounting
123136 assert output .records_read == df .count ()
124137 assert output .records_valid + output .records_invalid == output .records_read
125138
126- # validate rejects / metrics form scratch
139+ # valid_df must not contain invalid rows
140+ assert output .valid_df .filter (col (INVALID_DATA_FIELD_NAME ).isNotNull ()).count () == 0
141+
142+ # Audit tables written
127143 metrics = spark .table (f"{ pipeline_run .namespace } .{ METRICS } " )
128144 rejects = spark .table (f"{ pipeline_run .namespace } .{ REJECTS } " )
129145