Skip to content

Commit 9b5e4d0

Browse files
committed
validate schema only, drop redundant content checks
1 parent: 1acceb8 · commit: 9b5e4d0

1 file changed

Lines changed: 13 additions & 25 deletions

File tree

tests/parsers/refseq/api/test_annotation_report.py

Lines changed: 13 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import json
44
from pathlib import Path
5+
from unittest import result
56

67
import pytest
78
from cdm_data_loader_utils.parsers.refseq.api.annotation_report import (
@@ -714,44 +715,31 @@ def test_to_int(val, expected):
714715

715716
@pytest.mark.requires_spark
716717
def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None:
717-
"""Test the parsing of the annotation data."""
718+
"""
719+
Test that parse_annotation_data produces expected tables with correct schemas and non-empty output.
720+
"""
721+
718722
spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}")
719723
spark.catalog.setCurrentDatabase(TEST_NS)
720724

721-
# Load NCBI dataset from NCBI API
725+
# Load test JSON
722726
sample_api_response = test_data_dir / "refseq" / "annotation_report.json"
723727
dataset = json.load(sample_api_response.open())
724728

725-
# Run parse function
729+
# Run the parser
726730
parse_annotation_data(spark, [dataset], TEST_NS)
727731

728-
# Expected tables to validate from output
729-
expected_tables = [
730-
"contig",
731-
"contig_x_contigcollection",
732-
"contigcollection_x_feature",
733-
"contigcollection_x_protein",
734-
"feature",
735-
"feature_x_protein",
736-
"identifier",
737-
"name",
738-
]
732+
expected_tables = list(TABLE_NAME_MAP.keys())
739733

740734
for table_name in expected_tables:
741735
result_df = spark.table(f"{TEST_NS}.{table_name}")
742736
schema_key = TABLE_NAME_MAP[table_name]
743737

744-
# Construct expected_df just for schema comparison
745-
rows = [r.asDict() for r in result_df.collect()]
746-
expected_df = spark.createDataFrame(rows, schema=CDM_SCHEMA[schema_key])
747-
748-
# Assert schema match
738+
# Validate schema
749739
assertSchemaEqual(
750-
expected_df.schema,
751740
result_df.schema,
741+
CDM_SCHEMA[schema_key],
752742
)
753-
# Assert content match
754-
assertDataFrameEqual(
755-
expected_df,
756-
result_df,
757-
)
743+
744+
# Check table is not empty
745+
assert result_df.count() > 0, f"Table '{table_name}' is unexpectedly empty."

0 commit comments

Comments
 (0)