|
2 | 2 |
|
3 | 3 | import json |
4 | 4 | from pathlib import Path |
| 5 | +from unittest import result |
5 | 6 |
|
6 | 7 | import pytest |
7 | 8 | from cdm_data_loader_utils.parsers.refseq.api.annotation_report import ( |
@@ -714,44 +715,31 @@ def test_to_int(val, expected): |
714 | 715 |
|
@pytest.mark.requires_spark
def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None:
    """Test that parse_annotation_data writes every expected table.

    For each table name in TABLE_NAME_MAP, check that the table written to the
    TEST_NS database has a schema equal to the matching CDM_SCHEMA entry and
    that the table contains at least one row.
    """
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}")
    spark.catalog.setCurrentDatabase(TEST_NS)

    # Load the sample annotation report; the context manager guarantees the
    # file handle is closed even if JSON decoding fails.
    sample_api_response = test_data_dir / "refseq" / "annotation_report.json"
    with sample_api_response.open() as report_file:
        dataset = json.load(report_file)

    # Run the parser
    parse_annotation_data(spark, [dataset], TEST_NS)

    for table_name, schema_key in TABLE_NAME_MAP.items():
        result_df = spark.table(f"{TEST_NS}.{table_name}")

        # Validate schema (actual first, expected second)
        assertSchemaEqual(
            result_df.schema,
            CDM_SCHEMA[schema_key],
        )

        # Check table is not empty
        assert result_df.count() > 0, f"Table '{table_name}' is unexpectedly empty."
0 commit comments