validate schema and non-empty output only, drop redundant content checks

alinakbase · alinakbase · commit 9b5e4d0e99f6 · 2026-01-29T16:37:16.000-08:00
diff --git a/tests/parsers/refseq/api/test_annotation_report.py b/tests/parsers/refseq/api/test_annotation_report.py
@@ -2,6 +2,7 @@
 
 import json
 from pathlib import Path
+from unittest import result
 
 import pytest
 from cdm_data_loader_utils.parsers.refseq.api.annotation_report import (
@@ -714,44 +715,31 @@ def test_to_int(val, expected):
 
 @pytest.mark.requires_spark
 def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None:
-    """Test the parsing of the annotation data."""
+    """
+    Test that parse_annotation_data produces each table in TABLE_NAME_MAP, non-empty and matching its CDM_SCHEMA entry.
+    """
+
     spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}")
     spark.catalog.setCurrentDatabase(TEST_NS)
 
-    # Load NCBI dataset from NCBI API
+    # Load test JSON
     sample_api_response = test_data_dir / "refseq" / "annotation_report.json"
     dataset = json.load(sample_api_response.open())
 
-    # Run parse function
+    # Run the parser
     parse_annotation_data(spark, [dataset], TEST_NS)
 
-    # Expected tables to validate from output
-    expected_tables = [
-        "contig",
-        "contig_x_contigcollection",
-        "contigcollection_x_feature",
-        "contigcollection_x_protein",
-        "feature",
-        "feature_x_protein",
-        "identifier",
-        "name",
-    ]
+    expected_tables = list(TABLE_NAME_MAP.keys())
 
     for table_name in expected_tables:
         result_df = spark.table(f"{TEST_NS}.{table_name}")
         schema_key = TABLE_NAME_MAP[table_name]
 
-        # Construct expected_df just for schema comparison
-        rows = [r.asDict() for r in result_df.collect()]
-        expected_df = spark.createDataFrame(rows, schema=CDM_SCHEMA[schema_key])
-
-        # Assert schema match
+        # Validate schema
         assertSchemaEqual(
-            expected_df.schema,
             result_df.schema,
+            CDM_SCHEMA[schema_key],
         )
-        # Assert content match
-        assertDataFrameEqual(
-            expected_df,
-            result_df,
-        )
+
+        # Check table is not empty
+        assert result_df.count() > 0, f"Table '{table_name}' is unexpectedly empty."