test(integration): run arrow scenarios in CI instead of skipping

zfarrell · zfarrell · commit 9e32d72da9cf · 2026-06-05T20:36:11.000-07:00
diff --git a/test-requirements.txt b/test-requirements.txt
@@ -4,3 +4,7 @@ tox >= 3.9.0
 flake8 >= 4.0.0
 types-python-dateutil >= 2.8.19.14
 mypy >= 1.5
+# pyarrow backs the `arrow` extra. Required here (not just an optional extra) so
+# the arrow integration scenarios actually run in CI instead of silently
+# skipping via importorskip. Keep the floor in sync with pyproject's extra.
+pyarrow >= 14
diff --git a/tests/integration/test_managed_tables_lifecycle.py b/tests/integration/test_managed_tables_lifecycle.py
@@ -6,33 +6,41 @@
   1. declare a schema and a table on the database's default catalog connection,
   2. upload a small parquet file,
   3. load it into the table (load_managed_table),
-  4. read get_table_profile,
-  5. refresh the catalog metadata,
-  6. purge_table_cache,
-  7. delete the managed table.
+  4. poll get_table_profile until the profile syncs,
+  5. purge_table_cache,
+  6. delete the managed table.
+
+Note on managed-catalog semantics: there is no `refresh` step. `refresh` is
+rejected with a 400 on a managed catalog ("use the loads endpoint to update its
+data") — `load_managed_table` is itself the load. The profile is populated
+asynchronously after the load, so step 4 polls get_table_profile (a 404 means
+"not synced yet") rather than reading it once.
 
 The scratch_database fixture tears the database (and its catalog) down, so the
-test touches no seeded data. Skipped if pyarrow is unavailable (needed to author
-the parquet payload).
+test touches no seeded data. pyarrow is a hard test dependency (see
+test-requirements.txt) and is imported directly — a missing pyarrow must fail
+loudly, never silently skip this scenario in CI.
 """
 
 from __future__ import annotations
 
 import io
+import time
 
-import pytest
-
-pa = pytest.importorskip("pyarrow")
-pq = pytest.importorskip("pyarrow.parquet")
+import pyarrow as pa
+import pyarrow.parquet as pq
 
 from hotdata.api.connections_api import ConnectionsApi
 from hotdata.api.databases_api import DatabasesApi
-from hotdata.api.refresh_api import RefreshApi
 from hotdata.api.uploads_api import UploadsApi
+from hotdata.exceptions import ApiException
 from hotdata.models.add_managed_schema_request import AddManagedSchemaRequest
 from hotdata.models.add_managed_table_request import AddManagedTableRequest
 from hotdata.models.load_managed_table_request import LoadManagedTableRequest
-from hotdata.models.refresh_request import RefreshRequest
+
+
+PROFILE_SYNC_TIMEOUT_S = 60.0
+PROFILE_POLL_INTERVAL_S = 2.0
 
 
 def _parquet_bytes() -> bytes:
@@ -46,7 +54,6 @@ def test_managed_tables_lifecycle(
     databases_api: DatabasesApi,
     connections_api: ConnectionsApi,
     uploads_api: UploadsApi,
-    refresh_api: RefreshApi,
     scratch_database: str,
 ) -> None:
     # The database's auto-provisioned default catalog is a managed catalog,
@@ -77,15 +84,27 @@ def test_managed_tables_lifecycle(
     assert loaded.table_name == table_name
     assert loaded.row_count == 3
 
-    profile = connections_api.get_table_profile(connection_id, schema_name, table_name)
+    # The profile syncs asynchronously after the load — get_table_profile 404s
+    # ("Table may not be synced yet") until it lands. Poll instead of reading
+    # once. There is no manual trigger to force this: refresh is rejected on a
+    # managed catalog, and load_managed_table is the load.
+    deadline = time.monotonic() + PROFILE_SYNC_TIMEOUT_S
+    profile = None
+    while time.monotonic() < deadline:
+        try:
+            profile = connections_api.get_table_profile(
+                connection_id, schema_name, table_name
+            )
+            break
+        except ApiException as exc:
+            if exc.status != 404:
+                raise
+            time.sleep(PROFILE_POLL_INTERVAL_S)
+    assert profile is not None, "table profile never synced after load"
     assert profile.var_schema == schema_name
     assert profile.table == table_name
     assert profile.row_count == 3
 
-    # Refresh the catalog metadata for the managed connection.
-    refreshed = refresh_api.refresh(RefreshRequest(connection_id=connection_id))
-    assert refreshed.actual_instance is not None
-
     # purge_table_cache and delete_managed_table both return None on success.
     connections_api.purge_table_cache(connection_id, schema_name, table_name)
     connections_api.delete_managed_table(connection_id, schema_name, table_name)
diff --git a/tests/integration/test_results_arrow.py b/tests/integration/test_results_arrow.py
@@ -5,17 +5,18 @@
 that Arrow IPC content negotiation works end-to-end and that the streaming
 variant yields the same data.
 
-Skipped if pyarrow is not installed (the helper requires the ``arrow`` extra).
+pyarrow is a hard test dependency (see test-requirements.txt), so this imports
+it directly rather than via importorskip — a missing pyarrow must fail loudly,
+never silently skip this scenario in CI.
 """
 
 from __future__ import annotations
 
 import time
 
+import pyarrow as pa
 import pytest
 
-pa = pytest.importorskip("pyarrow")
-
 from hotdata.api.query_api import QueryApi
 from hotdata.api.query_runs_api import QueryRunsApi
 from hotdata.arrow import ResultsApi