First draft of integration test

ialarmedalien · ialarmedalien · commit eab27c9e7f93 · 2026-01-27T17:12:54.000-08:00
diff --git a/src/cdm_data_loader_utils/parsers/annotation_parse.py b/src/cdm_data_loader_utils/parsers/annotation_parse.py
@@ -55,20 +55,6 @@ def init_spark_and_db(app_name: str, database: str) -> SparkSession:
     return spark
 
 
-# ---------------------------------------------------------------------
-# CDM TABLE SCHEMAS
-# ---------------------------------------------------------------------
-# Using centralized schemas
-IDENTIFIER_SCHEMA = cdm_schemas["Identifier"]
-NAME_SCHEMA = cdm_schemas["Name"]
-FEATURE_SCHEMA = cdm_schemas["Feature"]
-CONTIG_COLLECTION_X_FEATURE_SCHEMA = cdm_schemas["ContigCollection_x_Feature"]
-CONTIG_COLLECTION_X_PROTEIN_SCHEMA = cdm_schemas["ContigCollection_x_Protein"]
-FEATURE_X_PROTEIN_SCHEMA = cdm_schemas["Feature_x_Protein"]
-CONTIG_SCHEMA = cdm_schemas["Contig"]
-CONTIG_X_CONTIG_COLLECTION_SCHEMA = cdm_schemas["Contig_x_ContigCollection"]
-
-
 # ---------------------------------------------------------------------
 # CDM PREFIX NORMALIZATION
 # ---------------------------------------------------------------------
@@ -160,19 +146,21 @@ def load_feature_records(data: dict) -> list[tuple]:
                     "minus": "negative",
                     "unstranded": "unstranded",
                 }.get(r.get("orientation"), "unknown")
-                features.append((
-                    feature_id,
-                    None,
-                    None,
-                    None,
-                    to_int(r.get("end")),
-                    None,
-                    to_int(r.get("begin")),
-                    strand,
-                    "RefSeq",
-                    None,
-                    "gene",
-                ))
+                features.append(
+                    (
+                        feature_id,
+                        None,
+                        None,
+                        None,
+                        to_int(r.get("end")),
+                        None,
+                        to_int(r.get("begin")),
+                        strand,
+                        "RefSeq",
+                        None,
+                        "gene",
+                    )
+                )
     return features
 
 
@@ -270,10 +258,12 @@ def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]:
         assembly = annotations[0].get("assembly_accession")
 
         if contig and assembly:
-            links.append((
-                f"refseq:{contig}",
-                apply_prefix(assembly),
-            ))
+            links.append(
+                (
+                    f"refseq:{contig}",
+                    apply_prefix(assembly),
+                )
+            )
 
     return links
 
@@ -285,27 +275,27 @@ def write_to_table(
     spark: SparkSession,
     records: list[tuple],
     table_name: str,
-    schema: StructType,
     database: str = "default",
 ) -> None:
     if records:
-        spark.createDataFrame(records, schema).write.format("delta").mode("overwrite").option(
+        spark.createDataFrame(records, cdm_schemas[table_name]).write.format("delta").mode("overwrite").option(
             "overwriteSchema", "true"
         ).saveAsTable(f"{database}.{table_name}")
 
 
 # ---------------------------------------------------------------------
 # SQL PREVIEW
 # ---------------------------------------------------------------------
+
 CDM_TABLES = [
-    "cdm_identifiers",
-    "cdm_names",
-    "cdm_features",
-    "cdm_contig_collection_x_feature",
-    "cdm_contig_collection_x_protein",
-    "cdm_feature_x_protein",
-    "cdm_contigs",
-    "cdm_contig_x_contig_collection",
+    "Identifier",
+    "Name",
+    "Feature",
+    "ContigCollection_x_Feature",
+    "ContigCollection_x_Protein",
+    "Feature_x_Protein",
+    "Contig",
+    "Contig_x_ContigCollection",
 ]
 
 
@@ -316,6 +306,68 @@ def run_sql_query(spark: SparkSession, database: str = "default") -> None:
         spark.sql(f"SELECT * FROM {table} LIMIT 20").show(truncate=False)
 
 
+def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None:
+    """Parse each dataset and write its records to the CDM tables in the ``namespace`` database."""
+    # NOTE(review): write_to_table saves with mode("overwrite"), so when several datasets are given
+    # each table appears to end up holding only the last dataset's records -- confirm this is intended.
+    for data in datasets:
+        write_to_table(
+            spark,
+            load_identifiers(data),
+            "Identifier",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_names(data),
+            "Name",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_feature_records(data),
+            "Feature",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_contig_collection_x_feature(data),
+            "ContigCollection_x_Feature",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_contig_collection_x_protein(data),
+            "ContigCollection_x_Protein",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_feature_x_protein(data),
+            "Feature_x_Protein",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_contigs(data),
+            "Contig",
+            namespace,
+        )
+
+        write_to_table(
+            spark,
+            load_contig_x_contig_collection(data),
+            "Contig_x_ContigCollection",
+            namespace,
+        )
+
+
 # ---------------------------------------------------------------------
 # CLI ENTRY
 # ---------------------------------------------------------------------
@@ -383,73 +435,7 @@ def main():
             with open(path) as f:
                 datasets.append(json.load(f))
 
-    # -----------------------------------------
-    # Parse and write CDM tables
-    # -----------------------------------------
-    for data in datasets:
-        write_to_table(
-            spark,
-            load_identifiers(data),
-            "cdm_identifiers",
-            IDENTIFIER_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_names(data),
-            "cdm_names",
-            NAME_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_feature_records(data),
-            "cdm_features",
-            FEATURE_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_contig_collection_x_feature(data),
-            "cdm_contig_collection_x_feature",
-            CONTIG_COLLECTION_X_FEATURE_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_contig_collection_x_protein(data),
-            "cdm_contig_collection_x_protein",
-            CONTIG_COLLECTION_X_PROTEIN_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_feature_x_protein(data),
-            "cdm_feature_x_protein",
-            FEATURE_X_PROTEIN_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_contigs(data),
-            "cdm_contigs",
-            CONTIG_SCHEMA,
-            args.namespace,
-        )
-
-        write_to_table(
-            spark,
-            load_contig_x_contig_collection(data),
-            "cdm_contig_x_contig_collection",
-            CONTIG_X_CONTIG_COLLECTION_SCHEMA,
-            args.namespace,
-        )
+    parse_annotation_data(spark, datasets, args.namespace)
 
     # -----------------------------------------
     # SQL preview
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -7,7 +7,7 @@
 from typing import Any
 
 import pytest
-from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import (
     ArrayType,
     BooleanType,
diff --git a/tests/parsers/test_annotation_parse.py b/tests/parsers/test_annotation_parse.py
@@ -1,9 +1,11 @@
 import json
 from pathlib import Path
-import pytest
 
+import pytest
+from pyspark.sql import SparkSession
 
 from src.cdm_data_loader_utils.parsers.annotation_parse import (
+    apply_prefix,
     load_contig_collection_x_feature,
     load_contig_collection_x_protein,
     load_contig_x_contig_collection,
@@ -12,9 +14,10 @@
     load_feature_x_protein,
     load_identifiers,
     load_names,
-    apply_prefix,
+    parse_annotation_data,
     to_int,
 )
+from tests.conftest import TEST_NS
 
 
 @pytest.mark.parametrize(
@@ -709,3 +712,31 @@ def test_apply_prefix(input_id, expected):
 @pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)])
 def test_to_int(val, expected):
     assert to_int(val) == expected
+
+
+@pytest.mark.requires_spark
+def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None:
+    """Test the parsing of the annotation data."""
+    test_ns = TEST_NS.lower()
+    spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}")
+    sample_api_response = test_data_dir / "refseq" / "annotation_report.json"
+    # a single dataset
+    dataset = json.load(sample_api_response.open())
+    expected_resp_path = test_data_dir / "refseq" / "annotation_report.parsed.json"
+    expected = json.load(expected_resp_path.open())
+
+    parse_annotation_data(spark, [dataset], TEST_NS)
+
+    for table_name, rows in expected.items():
+        # correct number of rows
+        result_df = spark.table(f"{test_ns}.{table_name}")
+        assert result_df.count() == len(rows)
+        # EITHER run r.asDict() to compare the data in dictionary form -- but watch out for the row order
+        output = [r.asDict() for r in result_df.collect()]
+        assert [r.asDict() for r in result_df.collect()] == rows
+        # OR create a dataframe from the expected rows and the schema and compare the dataframes directly
+
+
+#         expected_df = spark.createDataFrame(rows, schema=cdm_schemas[table_name])
+#         assertDataFrameSchemaEqual(expected_df, result_df)
+#         assertDataFrameEqual(expected_df, result_df)
diff --git a/tests/utils/test_spark_delta.py b/tests/utils/test_spark_delta.py
@@ -29,17 +29,6 @@
 TENANT_NAME = "The_Breakers"
 
 
-<<<<<<< HEAD
-=======
-@pytest.fixture
-def spark(tmp_path: Path) -> Generator[SparkSession, Any]:
-    """Generate a spark session with spark.sql.warehouse.dir set to the pytest temporary directory."""
-    spark = get_spark("test_delta_app", local=True, delta_lake=True, override={SAVE_DIR: tmp_path})
-    yield spark
-    spark.stop()
-
-
->>>>>>> 01a355b (Restore accidentally deleted files)
 def gen_ns_save_dir(current_save_dir: str, namespace: str, tenant_name: str | None) -> tuple[str, str]:
     """Generate the projected namespace and save directory, given a file path, a namespace, and a tenant name."""
     db_location = f"tenant/{tenant_name}/{namespace}.db" if tenant_name else f"user/some_user/{namespace}.db"