Add papermill-based tests for PyIceberg examples (#3330)

federicsp · web-flow · commit 95c45d44094c · 2026-05-15T10:13:54.000+02:00
Closes #3328

# Rationale for this change

`pyiceberg_example.ipynb` and `spark_integration_example.ipynb` had no automated test coverage, so breaking changes to notebook cells could go undetected in CI. This PR adds papermill-based tests that execute the real notebooks as-is (the Spark notebook is run with one extra injected cell that mocks `pyspark`), so any change to a cell is automatically reflected in the tests.

## Are these changes tested?

Yes. The tests themselves are the change. Run them with:

```bash
make test-notebook
```

## Are there any user-facing changes?

No.
diff --git a/Makefile b/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 .PHONY: help install install-uv check-license lint \
         test test-integration test-integration-setup test-integration-exec test-integration-cleanup test-integration-rebuild \
-        test-s3 test-adls test-gcs test-coverage coverage-report \
+        test-s3 test-adls test-gcs test-coverage coverage-report test-notebook \
         docs-serve docs-build notebook notebook-infra \
         clean
 
@@ -150,6 +150,9 @@ coverage-report: ## Combine and report coverage
 	uv run $(PYTHON_ARG) coverage html
 	uv run $(PYTHON_ARG) coverage xml
 
+# Runs the two papermill-driven test modules, selecting only tests that carry the `notebook` marker.
+test-notebook: ## Run notebook tests (pyiceberg_example and spark_integration_example) via papermill
+	$(TEST_RUNNER) pytest tests/notebooks/test_pyiceberg_example.py tests/notebooks/test_spark_integration_example.py -m notebook $(PYTEST_ARGS)
+
 # ================
 # Documentation
 # ================
diff --git a/pyproject.toml b/pyproject.toml
@@ -122,6 +122,9 @@ dev = [
     "google-cloud-bigquery>=3.33.0,<4",
     "pyarrow-stubs>=20.0.0.20251107", # Remove when pyarrow >= 23.0.0 https://github.com/apache/arrow/pull/47609
     "sqlalchemy>=2.0.18,<3",
+    "papermill>=2.6.0",
+    "nbformat>=5.10.0",
+    "ipykernel>=6.29.0",
 ]
 # for mkdocs
 docs = [
@@ -161,6 +164,7 @@ markers = [
   "integration: marks integration tests against Apache Spark",
   "gcs: marks a test as requiring access to gcs compliant storage (use with --gs.token, --gs.project, and --gs.endpoint)",
   "benchmark: collection of tests to validate read/write performance before and after a change",
+  "notebook: marks tests that execute Jupyter notebooks via papermill",
 ]
 
 # Turns a warning into an error
diff --git a/tests/notebooks/test_pyiceberg_example.py b/tests/notebooks/test_pyiceberg_example.py
@@ -0,0 +1,101 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import pytest
+
+pytestmark = pytest.mark.notebook
+
+NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "pyiceberg_example.ipynb"
+
+
+def get_all_stdout(nb: nbformat.NotebookNode) -> str:
+    """Return the text of every stdout stream output in *nb*, joined in cell order."""
+    chunks = []
+    for cell in nb.cells:
+        # Cells without an "outputs" entry contribute nothing.
+        for output in cell.get("outputs", []):
+            if output.get("output_type") != "stream":
+                continue
+            if output.get("name") != "stdout":
+                continue
+            chunks.append(output.get("text", ""))
+    return "".join(chunks)
+
+
+@pytest.fixture(scope="session")
+def pyiceberg_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode:
+    """Execute the PyIceberg example notebook once per test session.
+
+    The executed copy is written into a fresh temporary directory and the
+    notebook object returned by papermill is shared by all tests.
+    """
+    output_dir = tmp_path_factory.mktemp("nb_out")
+    executed_path = output_dir / "pyiceberg_example_out.ipynb"
+    executed_nb = pm.execute_notebook(str(NOTEBOOK_PATH), str(executed_path), kernel_name="python3")
+    return executed_nb
+
+
+class TestSmoke:
+    """Whole-notebook sanity checks on the executed PyIceberg example."""
+
+    def test_notebook_completes_without_error(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        """papermill raises PapermillExecutionError if any cell fails."""
+        assert pyiceberg_nb is not None
+
+    def test_all_code_cells_executed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        """Every code cell that has source must carry an execution count."""
+        for cell in pyiceberg_nb.cells:
+            # Blank code cells (e.g. a trailing empty cell) are skipped by the
+            # executor and never receive an execution count, so they must not
+            # be reported as "not executed".
+            if cell.cell_type != "code" or not cell.source.strip():
+                continue
+            assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}"
+
+
+class TestCellOutputs:
+    """Checks that the expected messages show up in the notebook's captured stdout."""
+
+    def test_pyiceberg_version_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "PyIceberg version:" in printed
+
+    def test_warehouse_location_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Warehouse location:" in printed
+        assert "iceberg_warehouse_" in printed
+
+    def test_catalog_loaded_successfully(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Catalog loaded successfully!" in printed
+
+    def test_namespace_default_created(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "default" in printed
+
+    def test_rows_written_is_five(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Rows written: 5" in printed
+
+    def test_schema_evolved_message(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Schema evolved!" in printed
+
+    def test_tip_per_mile_column_present_after_evolution(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "tip_per_mile" in printed
+
+    def test_filter_result_is_positive(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        """The notebook prints 'Rows with tip_per_mile > 1.0: N' — N must be > 0."""
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Rows with tip_per_mile > 1.0:" in printed
+        # Only the first matching line is inspected; the count is whatever
+        # follows the last colon on that line.
+        first_match = next(line for line in printed.splitlines() if "Rows with tip_per_mile > 1.0:" in line)
+        count = int(first_match.rsplit(":", 1)[-1].strip())
+        assert count > 0
+
+    def test_snapshot_id_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Current snapshot ID:" in printed
+
+    def test_table_history_has_entries(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert "Table history:" in printed
+        assert "Snapshot:" in printed
+
+    def test_warehouse_contains_parquet_and_metadata_files(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        printed = get_all_stdout(pyiceberg_nb)
+        assert ".parquet" in printed
+        assert ".metadata.json" in printed
diff --git a/tests/notebooks/test_spark_integration_example.py b/tests/notebooks/test_spark_integration_example.py
@@ -0,0 +1,170 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+import textwrap
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import pytest
+
+pytestmark = pytest.mark.notebook
+
+NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "spark_integration_example.ipynb"
+
+# ---------------------------------------------------------------------------
+# Mock pyspark
+# Source of a code cell that replaces pyspark.sql.SparkSession with a fake one.
+# The cell registers stand-in `pyspark` / `pyspark.sql` modules in sys.modules;
+# the fake session's sql() returns an object whose show() prints canned,
+# Spark-style tables for the queries listed in _SQL_RESPONSES and a generic
+# one-column table for anything else.
+# ---------------------------------------------------------------------------
+_MOCK_PYSPARK = textwrap.dedent("""\
+    import sys
+    import types
+    from unittest.mock import MagicMock
+
+    def _make_fake_pyspark():
+        pyspark_mod = types.ModuleType("pyspark")
+        sql_mod     = types.ModuleType("pyspark.sql")
+        pyspark_mod.sql = sql_mod
+        # Assign unconditionally: with setdefault() an already-imported real
+        # pyspark would stay in sys.modules while SparkSession below is
+        # patched onto these (then unused) fake modules.
+        sys.modules["pyspark"]     = pyspark_mod
+        sys.modules["pyspark.sql"] = sql_mod
+        return pyspark_mod, sql_mod
+
+    _pyspark, _sql = _make_fake_pyspark()
+
+    _SHOW_CATALOGS = (
+        "+-------------+\\n"
+        "|catalogName  |\\n"
+        "+-------------+\\n"
+        "|spark_catalog|\\n"
+        "|local        |\\n"
+        "+-------------+\\n"
+    )
+    _SHOW_NAMESPACES = (
+        "+---------+\\n"
+        "|namespace|\\n"
+        "+---------+\\n"
+        "|default  |\\n"
+        "+---------+\\n"
+    )
+    _SHOW_TABLES = (
+        "+---------+-----------+-----------+\\n"
+        "|namespace|tableName  |isTemporary|\\n"
+        "+---------+-----------+-----------+\\n"
+        "|default  |test_all   |false      |\\n"
+        "+---------+-----------+-----------+\\n"
+    )
+    _DESCRIBE_TABLE = (
+        "+--------------------+---------+-------+\\n"
+        "|col_name            |data_type|comment|\\n"
+        "+--------------------+---------+-------+\\n"
+        "|boolean_col         |boolean  |null   |\\n"
+        "|integer_col         |integer  |null   |\\n"
+        "+--------------------+---------+-------+\\n"
+    )
+    # Keys must match the notebook's queries exactly (after stripping
+    # whitespace and a trailing semicolon); the lookup is case-sensitive.
+    _SQL_RESPONSES = {
+        "SHOW CATALOGS":                        _SHOW_CATALOGS,
+        "SHOW NAMESPACES":                       _SHOW_NAMESPACES,
+        "SHOW TABLES FROM default":              _SHOW_TABLES,
+        "DESCRIBE TABLE default.test_all_types": _DESCRIBE_TABLE,
+    }
+
+    def _make_df(output):
+        df = MagicMock()
+        df.show.side_effect = lambda *a, **kw: print(output, end="")
+        return df
+
+    class _FakeBuilder:
+        def remote(self, url): return self
+        def getOrCreate(self): return _FakeSession()
+
+    class _FakeSession:
+        builder = _FakeBuilder()
+        def sql(self, query):
+            key = query.strip().rstrip(";")
+            output = _SQL_RESPONSES.get(key, "+------+\\n| col  |\\n+------+\\n| val  |\\n+------+\\n")
+            return _make_df(output)
+
+    _FakeSparkSession = MagicMock(spec=object)
+    _FakeSparkSession.builder = _FakeBuilder()
+    _sql.SparkSession = _FakeSparkSession
+""")
+
+
+def get_all_stdout(nb: nbformat.NotebookNode) -> str:
+    """Join the text of every stdout stream output, in notebook order."""
+    pieces = []
+    for cell in nb.cells:
+        cell_outputs = cell.get("outputs", [])
+        pieces.extend(
+            output.get("text", "")
+            for output in cell_outputs
+            if (output.get("output_type"), output.get("name")) == ("stream", "stdout")
+        )
+    return "".join(pieces)
+
+
+def _inject_mock_and_execute(notebook_path: Path, output_path: Path) -> nbformat.NotebookNode:
+    """Run a copy of the notebook that starts with the mock-pyspark cell.
+
+    The notebook at ``notebook_path`` is read, a code cell holding
+    ``_MOCK_PYSPARK`` is placed in front of its cells, and the result is saved
+    as ``spark_patched.ipynb`` next to ``output_path``. papermill then executes
+    that patched copy, writing the executed notebook to ``output_path``.
+    """
+    source_nb = nbformat.read(str(notebook_path), as_version=4)
+
+    # Tag the extra cell so it can be told apart from the notebook's own cells.
+    injected = nbformat.v4.new_code_cell(_MOCK_PYSPARK)
+    injected.metadata["tags"] = ["injected-mock"]
+    source_nb.cells.insert(0, injected)
+
+    patched_copy = output_path.with_name("spark_patched.ipynb")
+    nbformat.write(source_nb, str(patched_copy))
+
+    executed_nb = pm.execute_notebook(str(patched_copy), str(output_path), kernel_name="python3")
+    return executed_nb
+
+
+@pytest.fixture(scope="session")
+def spark_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode:
+    """Execute the Spark integration notebook (with mocked pyspark) once per test session."""
+    output_dir = tmp_path_factory.mktemp("nb_out")
+    executed_path = output_dir / "spark_integration_example_out.ipynb"
+    return _inject_mock_and_execute(NOTEBOOK_PATH, executed_path)
+
+
+class TestSmoke:
+    """Whole-notebook sanity checks on the executed Spark integration example."""
+
+    def test_notebook_completes_without_error(self, spark_nb: nbformat.NotebookNode) -> None:
+        """The fixture only yields a notebook if execution did not raise."""
+        assert spark_nb is not None
+
+    def test_all_code_cells_executed(self, spark_nb: nbformat.NotebookNode) -> None:
+        """Every code cell that has source must carry an execution count."""
+        for cell in spark_nb.cells:
+            # Blank code cells (e.g. a trailing empty cell) are skipped by the
+            # executor and never receive an execution count, so they must not
+            # be reported as "not executed".
+            if cell.cell_type != "code" or not cell.source.strip():
+                continue
+            assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}"
+
+
+class TestCellOutputs:
+    """Checks that the canned output of the mocked Spark session reaches the notebook's stdout."""
+
+    def test_show_catalogs_lists_spark_catalog_and_local(self, spark_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(spark_nb)
+        assert "spark_catalog" in stdout
+        assert "local" in stdout
+
+    def test_show_namespaces_contains_default(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "default" in get_all_stdout(spark_nb)
+
+    def test_show_tables_produces_tabular_output(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "+---------+-----------+-----------+" in get_all_stdout(spark_nb)
+
+    def test_describe_table_lists_column_names(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "col_name" in get_all_stdout(spark_nb)
+
+    def test_describe_table_lists_data_types(self, spark_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(spark_nb)
+        # The mocked DESCRIBE output contains both types, so require both;
+        # accepting either one would let a truncated table slip through.
+        assert "boolean" in stdout
+        assert "integer" in stdout
+
+    def test_show_tables_includes_test_table_row(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "test_all" in get_all_stdout(spark_nb)
diff --git a/uv.lock b/uv.lock