From 5ec03a9700f77ba6dd465e84b4775d4a7e4323e9 Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Wed, 20 May 2026 09:24:50 +0200 Subject: [PATCH 01/33] Fix Iceberg read optimization returning NULLs for stats-less manifests When an Iceberg manifest's per-file column statistics are absent or empty (common for non-Spark writers like pyiceberg with default settings), DataFileMetaInfo::columns_info is empty. The optimization in StorageObjectStorageSource::createReader misread this as "all columns are absent from the file" and returned constant NULLs for every row while still returning the correct row count. Result: silent data loss on icebergLocal, icebergS3, icebergAzure, icebergHDFS, and all *Cluster variants. Gate the optimization's absent-NULL loop directly on columns_info.empty() instead of introducing a separate stats-presence flag. When no usable per-column stats were parsed -- whether the manifest omitted the stats fields entirely or declared them but left them empty -- fall through to the Parquet reader, which correctly handles physically-present columns (read normally) and schema-evolved-absent columns (handled by IcebergMetadata::getInitialSchemaByPath setting the file's own schema as initial_header). columns_info is already serialized to workers in the cluster JSON path, so this changes no serialization format and keeps the fork's DataFileMetaInfo serde identical to upstream. Closes Altinity/ClickHouse#1545. Mirror of Altinity/ClickHouse#1688 (antalya-25.8 fix). Signed-off-by: Daniel Q. Kim --- .../StorageObjectStorageSource.cpp | 59 +- ...t_iceberg_read_optimization_empty_stats.py | 685 ++++++++++++++++++ 2 files changed, 716 insertions(+), 28 deletions(-) create mode 100644 tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index ee0cf191045c..a4a92ab58599 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -886,40 +886,43 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade } } } - for (const auto & column : requested_columns_list) + if (file_meta_data.has_value() && !file_meta_data.value()->columns_info.empty()) { - const auto & column_name = column.first; + for (const auto & column : requested_columns_list) + { + const auto & column_name = column.first; - if (file_meta_data.value()->columns_info.contains(column_name)) - continue; + if (file_meta_data.value()->columns_info.contains(column_name)) + continue; - if (!column.second.second.type->isNullable()) - continue; + if (!column.second.second.type->isNullable()) + continue; - /// With View over Iceberg table we have someting like 'materialize(time)' as column_name - /// Simple cheap check - if (column_name.starts_with("materialize(") && column_name.ends_with(")")) - continue; + /// With View over Iceberg table we have someting like 'materialize(time)' as column_name + /// Simple cheap check + if (column_name.starts_with("materialize(") && column_name.ends_with(")")) + continue; - /// Skip columns produced by prewhere or row-level filter expressions — - /// they are computed at read time, not stored in the file. - if (format_filter_info - && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name) - || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name))) - continue; + /// Skip columns produced by prewhere or row-level filter expressions — + /// they are computed at read time, not stored in the file. + if (format_filter_info + && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name) + || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name))) + continue; - /// Column is nullable and absent in file - constant_columns_with_values[column.second.first] = - ConstColumnWithValue{ - column.second.second, - Field() - }; - constant_columns.insert(column_name); - - LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", - object_info->getPath(), - column_name, - column.second.second.type); + /// Column is nullable and absent in file + constant_columns_with_values[column.second.first] = + ConstColumnWithValue{ + column.second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + column.second.second.type); + } } } diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py new file mode 100644 index 000000000000..34dc2fda2918 --- /dev/null +++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py @@ -0,0 +1,685 @@ +#!/usr/bin/env python3 +""" +Reproducer for Altinity/ClickHouse#1545: +icebergLocal() returns all-NULL columns when allow_experimental_iceberg_read_optimization=1 +and the Iceberg manifest has no column statistics. + +Root cause +---------- +The manifest Avro writer schema intentionally omits the optional stats fields +(value_counts, column_sizes, null_value_counts, lower_bounds, upper_bounds). +AvroForIcebergDeserializer.hasPath() therefore returns false for all three stats +paths in ManifestFile.cpp. columns_infos is left empty. + +StorageObjectStorageSource::createReader's second loop (the schema-evolution +"absent column as NULL" path) iterates over every requested column. Because +columns_infos is empty, every nullable column is absent → each is injected as a +constant NULL. With all requested columns constant, need_only_count is set to +true and the Parquet file is read in count-only mode; the output is the correct +number of rows, but every value is NULL. + +Tests +----- +- test_iceberg_local_returns_actual_rows_with_stats_less_manifest + FAILS on unpatched code: returns all-NULL rows instead of real data. + +- test_iceberg_local_returns_correct_rows_when_optimization_disabled + PASSES on unpatched code: regression guard — optimization=0 bypasses the + buggy path and reads the Parquet file normally. +""" + +import json +import os +import tempfile +import time +import uuid + +import avro.datafile +import avro.io +import avro.schema +import pyarrow as pa +import pyarrow.parquet as pq + +from helpers.iceberg_utils import get_uuid_str +from helpers.s3_tools import LocalUploader + + +# Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse). +_MANIFEST_LIST_SCHEMA_STR = json.dumps({ + "type": "record", + "name": "manifest_file", + "fields": [ + {"name": "manifest_path", "type": "string"}, + {"name": "manifest_length", "type": "long"}, + {"name": "partition_spec_id", "type": "int"}, + {"name": "content", "type": "int"}, + {"name": "sequence_number", "type": "long"}, + {"name": "min_sequence_number", "type": "long"}, + {"name": "added_snapshot_id", "type": "long"}, + {"name": "added_files_count", "type": "int"}, + {"name": "existing_files_count", "type": "int"}, + {"name": "deleted_files_count", "type": "int"}, + {"name": "added_rows_count", "type": "long"}, + {"name": "existing_rows_count", "type": "long"}, + {"name": "deleted_rows_count", "type": "long"}, + ], +}) + +# Manifest entry Avro schema that deliberately omits all per-column stats fields: +# column_sizes, value_counts, null_value_counts, lower_bounds, upper_bounds. +# +# Absent fields → AvroForIcebergDeserializer.hasPath() returns false for each → +# ManifestFile.cpp skips stats collection → columns_infos is empty → +# the optimization treats every nullable column as a "schema-evolution absent column" +# and injects a constant NULL. +_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({ + "type": "record", + "name": "manifest_entry", + "fields": [ + {"name": "status", "type": "int"}, + {"name": "snapshot_id", "type": ["null", "long"]}, + {"name": "sequence_number", "type": ["null", "long"]}, + {"name": "file_sequence_number", "type": ["null", "long"]}, + { + "name": "data_file", + "type": { + "type": "record", + "name": "r2", + "fields": [ + {"name": "content", "type": "int"}, + {"name": "file_path", "type": "string"}, + {"name": "file_format", "type": "string"}, + { + "name": "partition", + "type": {"type": "record", "name": "r102", "fields": []}, + }, + {"name": "record_count", "type": "long"}, + {"name": "file_size_in_bytes", "type": "long"}, + ], + }, + }, + ], +}) + + +def _write_avro(schema, records, path, metadata=None): + with open(path, "wb") as f: + writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), schema) + if metadata: + for k, v in metadata.items(): + writer.set_meta(k, v if isinstance(v, bytes) else v.encode("utf-8")) + for rec in records: + writer.append(rec) + writer.close() + + +def _create_stats_less_iceberg_table(tmpdir, table_name, container_base): + """ + Build a minimal Iceberg v2 table under tmpdir/table_name. The manifest is + written with a stats-less Avro schema to trigger the optimization bug. + + container_base is the absolute path inside the ClickHouse container where the + table will be placed after upload. File paths embedded in the Avro records and + metadata.json must reference container_base because they are interpreted at + query time inside the container. + + Returns the local table directory path (tmpdir/table_name). + """ + table_local = os.path.join(tmpdir, table_name) + data_local = os.path.join(table_local, "data") + meta_local = os.path.join(table_local, "metadata") + os.makedirs(data_local) + os.makedirs(meta_local) + + # Parquet data file with two nullable columns. + arrow_schema = pa.schema([ + pa.field("id", pa.int32(), nullable=True), + pa.field("data", pa.string(), nullable=True), + ]) + pq.write_table( + pa.table( + { + "id": pa.array([1, 2, 3], type=pa.int32()), + "data": pa.array(["hello", "world", "iceberg"], type=pa.string()), + }, + schema=arrow_schema, + ), + os.path.join(data_local, "00000-0-data.parquet"), + ) + data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet")) + data_container_path = f"{container_base}/data/00000-0-data.parquet" + + snapshot_id = 1 + seq_number = 1 + ts_ms = int(time.time() * 1000) + + # Iceberg schema and partition-spec to embed as Avro file-level metadata. + # ManifestFile.cpp requires both keys to be present in the manifest's Avro header. + iceberg_schema = { + "type": "struct", + "schema-id": 0, + "fields": [ + {"id": 1, "name": "id", "required": False, "type": "int"}, + {"id": 2, "name": "data", "required": False, "type": "string"}, + ], + } + + # Manifest file (stats-less Avro schema). + manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro") + manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro" + _write_avro( + avro.schema.parse(_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR), + [{ + "status": 1, # ADDED + "snapshot_id": snapshot_id, + "sequence_number": seq_number, + "file_sequence_number": seq_number, + "data_file": { + "content": 0, # DATA + "file_path": data_container_path, + "file_format": "PARQUET", + "partition": {}, + "record_count": 3, + "file_size_in_bytes": data_size, + }, + }], + manifest_local_path, + metadata={ + "schema": json.dumps(iceberg_schema), + "partition-spec": "[]", + }, + ) + manifest_size = os.path.getsize(manifest_local_path) + + # Manifest list. + mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro" + mlist_local_path = os.path.join(meta_local, mlist_filename) + mlist_container_path = f"{container_base}/metadata/{mlist_filename}" + _write_avro( + avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR), + [{ + "manifest_path": manifest_container_path, + "manifest_length": manifest_size, + "partition_spec_id": 0, + "content": 0, # DATA + "sequence_number": seq_number, + "min_sequence_number": seq_number, + "added_snapshot_id": snapshot_id, + "added_files_count": 1, + "existing_files_count": 0, + "deleted_files_count": 0, + "added_rows_count": 3, + "existing_rows_count": 0, + "deleted_rows_count": 0, + }], + mlist_local_path, + ) + + # Table metadata JSON. + metadata = { + "format-version": 2, + "table-uuid": str(uuid.uuid4()), + "location": container_base, + "last-sequence-number": seq_number, + "last-updated-ms": ts_ms, + "last-column-id": 2, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + {"id": 1, "name": "id", "required": False, "type": "int"}, + {"id": 2, "name": "data", "required": False, "type": "string"}, + ], + }], + "default-spec-id": 0, + "partition-specs": [{"spec-id": 0, "fields": []}], + "last-partition-id": 999, + "default-sort-order-id": 0, + "sort-orders": [{"order-id": 0, "fields": []}], + "properties": {}, + "current-snapshot-id": snapshot_id, + "snapshots": [{ + "snapshot-id": snapshot_id, + "sequence-number": seq_number, + "timestamp-ms": ts_ms, + "manifest-list": mlist_container_path, + "summary": {"operation": "append"}, + "schema-id": 0, + }], + "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}], + "metadata-log": [], + "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}}, + } + with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f: + json.dump(metadata, f, indent=2) + + return table_local + + +def _upload_to_container(instance, local_table_dir, container_base): + uploader = LocalUploader(instance) + for root, _dirs, files in os.walk(local_table_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel = os.path.relpath(local_path, local_table_dir) + uploader.upload_file(local_path, os.path.join(container_base, rel)) + + +# Manifest entry schema with value_counts only (partial stats). +# column_sizes and null_value_counts are absent. value_counts alone is enough to +# populate columns_info so the absent-NULL guard (columns_info.empty()) is not triggered. +_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = json.dumps({ + "type": "record", + "name": "manifest_entry", + "fields": [ + {"name": "status", "type": "int"}, + {"name": "snapshot_id", "type": ["null", "long"]}, + {"name": "sequence_number", "type": ["null", "long"]}, + {"name": "file_sequence_number", "type": ["null", "long"]}, + { + "name": "data_file", + "type": { + "type": "record", + "name": "r2", + "fields": [ + {"name": "content", "type": "int"}, + {"name": "file_path", "type": "string"}, + {"name": "file_format", "type": "string"}, + { + "name": "partition", + "type": {"type": "record", "name": "r102", "fields": []}, + }, + {"name": "record_count", "type": "long"}, + {"name": "file_size_in_bytes", "type": "long"}, + # Only value_counts is present (column_sizes and null_value_counts absent). + { + "name": "value_counts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "k81v81", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "long"}, + ], + }, + }, + }, + ], + }, + }, + ], +}) + +# Manifest entry schema with all three stats fields (value_counts, column_sizes, +# null_value_counts). Represents a manifest written by a full-stats writer such as Spark. +_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = json.dumps({ + "type": "record", + "name": "manifest_entry", + "fields": [ + {"name": "status", "type": "int"}, + {"name": "snapshot_id", "type": ["null", "long"]}, + {"name": "sequence_number", "type": ["null", "long"]}, + {"name": "file_sequence_number", "type": ["null", "long"]}, + { + "name": "data_file", + "type": { + "type": "record", + "name": "r2", + "fields": [ + {"name": "content", "type": "int"}, + {"name": "file_path", "type": "string"}, + {"name": "file_format", "type": "string"}, + { + "name": "partition", + "type": {"type": "record", "name": "r102", "fields": []}, + }, + {"name": "record_count", "type": "long"}, + {"name": "file_size_in_bytes", "type": "long"}, + { + "name": "column_sizes", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "k81v81", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "long"}, + ], + }, + }, + }, + { + "name": "value_counts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "k81v81_vc", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "long"}, + ], + }, + }, + }, + { + "name": "null_value_counts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "k81v81_nc", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "long"}, + ], + }, + }, + }, + ], + }, + }, + ], +}) + + +def _create_iceberg_table_with_schema(tmpdir, table_name, container_base, + manifest_schema_str, manifest_extra_fields=None): + """ + Build a minimal Iceberg v2 table. manifest_extra_fields is added to each + manifest entry's data_file record when using schemas that include stats. + """ + table_local = os.path.join(tmpdir, table_name) + data_local = os.path.join(table_local, "data") + meta_local = os.path.join(table_local, "metadata") + os.makedirs(data_local) + os.makedirs(meta_local) + + arrow_schema = pa.schema([ + pa.field("id", pa.int32(), nullable=True), + pa.field("data", pa.string(), nullable=True), + ]) + pq.write_table( + pa.table( + { + "id": pa.array([1, 2, 3], type=pa.int32()), + "data": pa.array(["hello", "world", "iceberg"], type=pa.string()), + }, + schema=arrow_schema, + ), + os.path.join(data_local, "00000-0-data.parquet"), + ) + data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet")) + data_container_path = f"{container_base}/data/00000-0-data.parquet" + + snapshot_id = 1 + seq_number = 1 + ts_ms = int(time.time() * 1000) + + iceberg_schema = { + "type": "struct", + "schema-id": 0, + "fields": [ + {"id": 1, "name": "id", "required": False, "type": "int"}, + {"id": 2, "name": "data", "required": False, "type": "string"}, + ], + } + + data_file_record = { + "content": 0, + "file_path": data_container_path, + "file_format": "PARQUET", + "partition": {}, + "record_count": 3, + "file_size_in_bytes": data_size, + } + if manifest_extra_fields: + data_file_record.update(manifest_extra_fields) + + manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro") + manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro" + _write_avro( + avro.schema.parse(manifest_schema_str), + [{ + "status": 1, + "snapshot_id": snapshot_id, + "sequence_number": seq_number, + "file_sequence_number": seq_number, + "data_file": data_file_record, + }], + manifest_local_path, + metadata={ + "schema": json.dumps(iceberg_schema), + "partition-spec": "[]", + }, + ) + manifest_size = os.path.getsize(manifest_local_path) + + mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro" + mlist_local_path = os.path.join(meta_local, mlist_filename) + mlist_container_path = f"{container_base}/metadata/{mlist_filename}" + _write_avro( + avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR), + [{ + "manifest_path": manifest_container_path, + "manifest_length": manifest_size, + "partition_spec_id": 0, + "content": 0, + "sequence_number": seq_number, + "min_sequence_number": seq_number, + "added_snapshot_id": snapshot_id, + "added_files_count": 1, + "existing_files_count": 0, + "deleted_files_count": 0, + "added_rows_count": 3, + "existing_rows_count": 0, + "deleted_rows_count": 0, + }], + mlist_local_path, + ) + + metadata = { + "format-version": 2, + "table-uuid": str(uuid.uuid4()), + "location": container_base, + "last-sequence-number": seq_number, + "last-updated-ms": ts_ms, + "last-column-id": 2, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + {"id": 1, "name": "id", "required": False, "type": "int"}, + {"id": 2, "name": "data", "required": False, "type": "string"}, + ], + }], + "default-spec-id": 0, + "partition-specs": [{"spec-id": 0, "fields": []}], + "last-partition-id": 999, + "default-sort-order-id": 0, + "sort-orders": [{"order-id": 0, "fields": []}], + "properties": {}, + "current-snapshot-id": snapshot_id, + "snapshots": [{ + "snapshot-id": snapshot_id, + "sequence-number": seq_number, + "timestamp-ms": ts_ms, + "manifest-list": mlist_container_path, + "summary": {"operation": "append"}, + "schema-id": 0, + }], + "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}], + "metadata-log": [], + "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}}, + } + with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f: + json.dump(metadata, f, indent=2) + + return table_local + + +def test_iceberg_local_returns_actual_rows_with_stats_less_manifest( + started_cluster_iceberg_no_spark, +): + """ + FAILS on unpatched code (Altinity#1545). + + With allow_experimental_iceberg_read_optimization=1 (the default), reading an + icebergLocal table whose manifest omits column statistics returns all-NULL rows + instead of real data. The correct output is 3 rows: (1,'hello'), (2,'world'), + (3,'iceberg'). + """ + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_opt_on_stats_less_" + get_uuid_str() + container_base = ( + f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" + ) + + with tempfile.TemporaryDirectory() as tmpdir: + local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base) + _upload_to_container(instance, local_dir, container_base) + + result = instance.query( + f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" + " ORDER BY id" + " SETTINGS allow_experimental_iceberg_read_optimization=1" + ).strip() + + assert result == "1\thello\n2\tworld\n3\ticeberg", ( + f"Got: {result!r}\n" + "All-NULL rows means the bug fired: empty columns_infos caused the " + "optimization to treat every nullable column as an absent schema-evolution " + "column and inject constant NULL (need_only_count=1)." + ) + + +def test_iceberg_local_returns_correct_rows_when_optimization_disabled( + started_cluster_iceberg_no_spark, +): + """ + PASSES on unpatched code. + + With allow_experimental_iceberg_read_optimization=0 the optimization block is + skipped entirely and the Parquet file is read normally. This test is a + regression guard: it must keep passing after the bug is fixed. + """ + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_opt_off_stats_less_" + get_uuid_str() + container_base = ( + f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" + ) + + with tempfile.TemporaryDirectory() as tmpdir: + local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base) + _upload_to_container(instance, local_dir, container_base) + + result = instance.query( + f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" + " ORDER BY id" + " SETTINGS allow_experimental_iceberg_read_optimization=0" + ).strip() + + assert result == "1\thello\n2\tworld\n3\ticeberg" + + +def test_iceberg_local_partial_stats_manifest_reads_correctly( + started_cluster_iceberg_no_spark, +): + """ + Regression test for Altinity#1545: partial stats (value_counts only). + + A manifest that includes value_counts but omits column_sizes and + null_value_counts still populates columns_info. The fix must treat the + columns as real (not absent), so real data is returned. + """ + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_opt_on_partial_stats_" + get_uuid_str() + container_base = ( + f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" + ) + + # value_counts for field_id 1 (id) and 2 (data): 3 rows each. + extra = { + "value_counts": [ + {"key": 1, "value": 3}, + {"key": 2, "value": 3}, + ], + } + + with tempfile.TemporaryDirectory() as tmpdir: + local_dir = _create_iceberg_table_with_schema( + tmpdir, table_name, container_base, + _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR, + manifest_extra_fields=extra, + ) + _upload_to_container(instance, local_dir, container_base) + + result = instance.query( + f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" + " ORDER BY id" + " SETTINGS allow_experimental_iceberg_read_optimization=1" + ).strip() + + assert result == "1\thello\n2\tworld\n3\ticeberg", ( + f"Got: {result!r}\n" + "Partial-stats manifest (value_counts only) should not trigger the absent-NULL " + "path because columns_info is non-empty. All-NULL rows means the guard " + "fired incorrectly for partial-stats manifests." + ) + + +def test_iceberg_local_full_stats_manifest_reads_correctly( + started_cluster_iceberg_no_spark, +): + """ + Control test: full-stats manifest (Spark-like) must continue to return real data. + + A manifest with all three stats fields (column_sizes, value_counts, + null_value_counts) is the common case written by Spark. This test verifies the + optimization does not regress for the normal path after the fix. + """ + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_opt_on_full_stats_" + get_uuid_str() + container_base = ( + f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" + ) + + # Full stats for field_id 1 (id) and 2 (data). + extra = { + "column_sizes": [ + {"key": 1, "value": 64}, + {"key": 2, "value": 128}, + ], + "value_counts": [ + {"key": 1, "value": 3}, + {"key": 2, "value": 3}, + ], + "null_value_counts": [ + {"key": 1, "value": 0}, + {"key": 2, "value": 0}, + ], + } + + with tempfile.TemporaryDirectory() as tmpdir: + local_dir = _create_iceberg_table_with_schema( + tmpdir, table_name, container_base, + _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR, + manifest_extra_fields=extra, + ) + _upload_to_container(instance, local_dir, container_base) + + result = instance.query( + f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" + " ORDER BY id" + " SETTINGS allow_experimental_iceberg_read_optimization=1" + ).strip() + + assert result == "1\thello\n2\tworld\n3\ticeberg", ( + f"Got: {result!r}\n" + "Full-stats manifest should return real data when optimization is enabled." + ) From 056eb1a3ec8926a0a1a9541cf767571b4e9219b5 Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Mon, 22 Jun 2026 14:55:28 +0200 Subject: [PATCH 02/33] Rework Iceberg empty-stats tests per review feedback Address @zvonand's review of #1895: the tests were poorly scoped. The C++ fix (gate the optimization's absent-NULL loop on `!columns_info.empty()`) is unchanged; only the integration tests are reworked. - Drop the `allow_experimental_iceberg_read_optimization=0` test. The fix only changes behavior when the optimization is enabled, so the disabled path is unrelated to this change and belongs in a separate PR if at all. - Add `test_stats_less_manifest_schema_evolution_absent_column`, a discriminating test where the Iceberg schema declares `id`+`data` but the Parquet file was written under the older schema and physically contains only `id`. On the fixed build the stats-less manifest falls through to the Parquet reader, reads `id` for real and synthesizes the absent `data` column as NULL via `IcebergMetadata::getInitialSchemaByPath` (the file's own schema becomes `initial_header`). On the unpatched build `id` is NULL too, so the result discriminates. - Add `test_stats_less_manifest_cluster_returns_real_data`, exercising the cluster JSON serialization path with `icebergLocalCluster`. An empty `columns_info` must round-trip to the workers and still read real data, confirming the bug also affected the `*Cluster` variants. - Add `test_non_empty_stats_absent_column_still_null`, a non-regression guard that the optimization's legitimate absent-NULL path (a column absent from a non-empty `columns_info`) keeps returning NULL while present columns read for real. - Merge the partial-stats and full-stats cases into one parametrized `test_non_empty_stats_returns_real_data`. - Keep the Iceberg fixtures generated in code rather than checked in as static files (zvonand's readability comment). Each manifest entry and the `metadata.json` embed per-test runtime values -- the UUID-suffixed container path, the byte sizes of the freshly written Parquet/Avro files and a random `table-uuid` -- so they are templates filled at run time, not static blobs. The one static part, the Avro schemas, is deduplicated into module constants, and a module-level comment explains the choice. Verified on the fixed build: all 6 cases pass. With the guard reverted to run the loop unconditionally, the three stats-less tests fail with all-NULL rows while the two non-empty-stats guards pass. Signed-off-by: Daniel Q. Kim Co-Authored-By: Claude Opus 4.8 --- ...t_iceberg_read_optimization_empty_stats.py | 895 ++++++++---------- 1 file changed, 380 insertions(+), 515 deletions(-) diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py index 34dc2fda2918..fe0c8e6d085f 100644 --- a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py +++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py @@ -1,31 +1,48 @@ #!/usr/bin/env python3 """ -Reproducer for Altinity/ClickHouse#1545: -icebergLocal() returns all-NULL columns when allow_experimental_iceberg_read_optimization=1 -and the Iceberg manifest has no column statistics. +Regression coverage for Altinity/ClickHouse#1545: +the Iceberg read optimization (``allow_experimental_iceberg_read_optimization=1``) +returned all-NULL columns when the manifest carried no per-file column statistics. Root cause ---------- -The manifest Avro writer schema intentionally omits the optional stats fields -(value_counts, column_sizes, null_value_counts, lower_bounds, upper_bounds). -AvroForIcebergDeserializer.hasPath() therefore returns false for all three stats -paths in ManifestFile.cpp. columns_infos is left empty. - -StorageObjectStorageSource::createReader's second loop (the schema-evolution -"absent column as NULL" path) iterates over every requested column. Because -columns_infos is empty, every nullable column is absent → each is injected as a -constant NULL. With all requested columns constant, need_only_count is set to -true and the Parquet file is read in count-only mode; the output is the correct -number of rows, but every value is NULL. - -Tests ------ -- test_iceberg_local_returns_actual_rows_with_stats_less_manifest - FAILS on unpatched code: returns all-NULL rows instead of real data. - -- test_iceberg_local_returns_correct_rows_when_optimization_disabled - PASSES on unpatched code: regression guard — optimization=0 bypasses the - buggy path and reads the Parquet file normally. +When a manifest's optional stats fields (``value_counts``, ``column_sizes``, +``null_value_counts``, ``lower_bounds``, ``upper_bounds``) are all absent -- +common for non-Spark writers such as pyiceberg with default settings -- +``DataFileMetaInfo::columns_info`` is left empty. The optimization's +absent-column loop in ``StorageObjectStorageSource::createReader`` then iterated +every requested column, found none in the empty ``columns_info`` map, and +injected each nullable column as a constant ``NULL``. With every requested +column constant, ``need_only_count`` became true and the Parquet file was read +in count-only mode: correct row count, every value ``NULL``. + +The fix gates that loop on ``!columns_info.empty()``; an empty map now falls +through to the Parquet reader, which reads physically-present columns normally +and synthesizes schema-evolved-absent columns as ``NULL`` via +``IcebergMetadata::getInitialSchemaByPath``. + +Test taxonomy +------------- +Discriminating (fail on the unpatched build, pass on the fixed build): + - test_stats_less_manifest_returns_real_data + - test_stats_less_manifest_schema_evolution_absent_column +Cluster/serialization path (the cluster JSON round-trip of an empty +``columns_info``): + - test_stats_less_manifest_cluster_returns_real_data +Non-regression guards (pass on both builds; the optimization's legitimate +absent-NULL and present-column behavior must keep working when stats exist): + - test_non_empty_stats_absent_column_still_null + - test_non_empty_stats_returns_real_data + +Why the Iceberg fixtures are generated in code rather than checked in as static +files: every manifest entry and the table ``metadata.json`` embed per-test +runtime values -- the UUID-suffixed container path, the byte sizes of the +freshly written Parquet and Avro files, and a random ``table-uuid`` -- so they +are templates filled at run time, not the static blobs that a checked-in fixture +(e.g. ``tests/queries/0_stateless/data_minio/.../v1.metadata.json``) would be. +The one genuinely static part, the Avro schemas, is deduplicated into the +module-level constants below; the stats-less manifest entry -- the artifact +under test -- is necessarily produced in code. """ import json @@ -39,11 +56,20 @@ import avro.schema import pyarrow as pa import pyarrow.parquet as pq +import pytest from helpers.iceberg_utils import get_uuid_str from helpers.s3_tools import LocalUploader +# The three rows every fixture stores; ``data`` is omitted from the Parquet file +# when a test needs a column the file physically lacks. +_IDS = [1, 2, 3] +_DATA = ["hello", "world", "iceberg"] +_REAL_DATA_RESULT = "1\thello\n2\tworld\n3\ticeberg" +_DATA_NULL_RESULT = "1\t\\N\n2\t\\N\n3\t\\N" + + # Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse). _MANIFEST_LIST_SCHEMA_STR = json.dumps({ "type": "record", @@ -65,46 +91,84 @@ ], }) -# Manifest entry Avro schema that deliberately omits all per-column stats fields: -# column_sizes, value_counts, null_value_counts, lower_bounds, upper_bounds. -# -# Absent fields → AvroForIcebergDeserializer.hasPath() returns false for each → -# ManifestFile.cpp skips stats collection → columns_infos is empty → -# the optimization treats every nullable column as a "schema-evolution absent column" -# and injects a constant NULL. -_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({ - "type": "record", - "name": "manifest_entry", - "fields": [ - {"name": "status", "type": "int"}, - {"name": "snapshot_id", "type": ["null", "long"]}, - {"name": "sequence_number", "type": ["null", "long"]}, - {"name": "file_sequence_number", "type": ["null", "long"]}, - { - "name": "data_file", + +def _data_file_fields(stats_fields): + """data_file record fields, optionally followed by the given stats arrays.""" + fields = [ + {"name": "content", "type": "int"}, + {"name": "file_path", "type": "string"}, + {"name": "file_format", "type": "string"}, + {"name": "partition", "type": {"type": "record", "name": "r102", "fields": []}}, + {"name": "record_count", "type": "long"}, + {"name": "file_size_in_bytes", "type": "long"}, + ] + for i, name in enumerate(stats_fields): + fields.append({ + "name": name, "type": { - "type": "record", - "name": "r2", - "fields": [ - {"name": "content", "type": "int"}, - {"name": "file_path", "type": "string"}, - {"name": "file_format", "type": "string"}, - { - "name": "partition", - "type": {"type": "record", "name": "r102", "fields": []}, - }, - {"name": "record_count", "type": "long"}, - {"name": "file_size_in_bytes", "type": "long"}, - ], + "type": "array", + "items": { + "type": "record", + "name": f"k81v81_{i}", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "long"}, + ], + }, }, - }, - ], -}) + }) + return fields -def _write_avro(schema, records, path, metadata=None): +def _manifest_entry_schema(stats_fields): + """Iceberg v2 manifest entry Avro schema carrying the given stats arrays.""" + return json.dumps({ + "type": "record", + "name": "manifest_entry", + "fields": [ + {"name": "status", "type": "int"}, + {"name": "snapshot_id", "type": ["null", "long"]}, + {"name": "sequence_number", "type": ["null", "long"]}, + {"name": "file_sequence_number", "type": ["null", "long"]}, + { + "name": "data_file", + "type": { + "type": "record", + "name": "r2", + "fields": _data_file_fields(stats_fields), + }, + }, + ], + }) + + +# Stats-less manifest: omits every optional stats field -> columns_info is empty. +_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = _manifest_entry_schema([]) +# Partial stats: value_counts only (column_sizes and null_value_counts absent); +# enough to populate columns_info so the empty-stats guard is not triggered. +_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = _manifest_entry_schema(["value_counts"]) +# Full stats: the common Spark-written shape. +_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = _manifest_entry_schema( + ["column_sizes", "value_counts", "null_value_counts"] +) + + +def _schema_fields(with_data): + fields = [{"id": 1, "name": "id", "required": False, "type": "int"}] + if with_data: + fields.append({"id": 2, "name": "data", "required": False, "type": "string"}) + return fields + + +def _iceberg_schema(schema_id, with_data): + return {"type": "struct", "schema-id": schema_id, "fields": _schema_fields(with_data)} + + +def _write_avro(schema_str, records, path, metadata=None): with open(path, "wb") as f: - writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), schema) + writer = avro.datafile.DataFileWriter( + f, avro.io.DatumWriter(), avro.schema.parse(schema_str) + ) if metadata: for k, v in metadata.items(): writer.set_meta(k, v if isinstance(v, bytes) else v.encode("utf-8")) @@ -113,17 +177,32 @@ def _write_avro(schema, records, path, metadata=None): writer.close() -def _create_stats_less_iceberg_table(tmpdir, table_name, container_base): +def _create_iceberg_table( + tmpdir, + table_name, + container_base, + *, + manifest_entry_schema_str, + file_has_data_column=True, + data_file_stats=None, + schema_evolution=False, +): """ - Build a minimal Iceberg v2 table under tmpdir/table_name. The manifest is - written with a stats-less Avro schema to trigger the optimization bug. + Build a minimal Iceberg v2 table under ``tmpdir/table_name``. container_base is the absolute path inside the ClickHouse container where the - table will be placed after upload. File paths embedded in the Avro records and - metadata.json must reference container_base because they are interpreted at - query time inside the container. - - Returns the local table directory path (tmpdir/table_name). + table is placed after upload; file paths embedded in the Avro records and + metadata.json must reference it because they are interpreted at query time + inside the container. + + file_has_data_column controls whether the Parquet file physically contains + the ``data`` column. data_file_stats, when given, supplies the manifest + entry's stats arrays (e.g. ``value_counts``). schema_evolution lays the + table out with two schemas (0: ``id`` only, 1: ``id``+``data``) and two + snapshots so the data file resolves to the older schema id, exercising the + Parquet-reader fall-through that synthesizes the absent ``data`` column. + + Returns the local table directory path. """ table_local = os.path.join(tmpdir, table_name) data_local = os.path.join(table_local, "data") @@ -131,125 +210,137 @@ def _create_stats_less_iceberg_table(tmpdir, table_name, container_base): os.makedirs(data_local) os.makedirs(meta_local) - # Parquet data file with two nullable columns. - arrow_schema = pa.schema([ - pa.field("id", pa.int32(), nullable=True), - pa.field("data", pa.string(), nullable=True), - ]) + arrow_fields = [pa.field("id", pa.int32(), nullable=True)] + table_data = {"id": pa.array(_IDS, type=pa.int32())} + if file_has_data_column: + arrow_fields.append(pa.field("data", pa.string(), nullable=True)) + table_data["data"] = pa.array(_DATA, type=pa.string()) pq.write_table( - pa.table( - { - "id": pa.array([1, 2, 3], type=pa.int32()), - "data": pa.array(["hello", "world", "iceberg"], type=pa.string()), - }, - schema=arrow_schema, - ), + pa.table(table_data, schema=pa.schema(arrow_fields)), os.path.join(data_local, "00000-0-data.parquet"), ) data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet")) data_container_path = f"{container_base}/data/00000-0-data.parquet" - snapshot_id = 1 - seq_number = 1 - ts_ms = int(time.time() * 1000) + ts_ms = int(time.time() * 1000) - # Iceberg schema and partition-spec to embed as Avro file-level metadata. - # ManifestFile.cpp requires both keys to be present in the manifest's Avro header. - iceberg_schema = { - "type": "struct", - "schema-id": 0, - "fields": [ - {"id": 1, "name": "id", "required": False, "type": "int"}, - {"id": 2, "name": "data", "required": False, "type": "string"}, - ], + # The manifest entry's snapshot_id is the snapshot that wrote the data file. + # Under schema evolution that snapshot (id 1) carries the older schema (0), + # so the iterator resolves the file to schema 0 while reading the table at + # schema 1 -- the condition that makes getInitialSchemaByPath return the + # file's own schema as initial_header. + entry_snapshot_id = 1 + write_schema_id = 0 # schema the data file was written with + + data_file_record = { + "content": 0, # DATA + "file_path": data_container_path, + "file_format": "PARQUET", + "partition": {}, + "record_count": len(_IDS), + "file_size_in_bytes": data_size, } + if data_file_stats: + data_file_record.update(data_file_stats) - # Manifest file (stats-less Avro schema). manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro") manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro" _write_avro( - avro.schema.parse(_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR), + manifest_entry_schema_str, [{ "status": 1, # ADDED - "snapshot_id": snapshot_id, - "sequence_number": seq_number, - "file_sequence_number": seq_number, - "data_file": { - "content": 0, # DATA - "file_path": data_container_path, - "file_format": "PARQUET", - "partition": {}, - "record_count": 3, - "file_size_in_bytes": data_size, - }, + "snapshot_id": entry_snapshot_id, + "sequence_number": 1, + "file_sequence_number": 1, + "data_file": data_file_record, }], manifest_local_path, metadata={ - "schema": json.dumps(iceberg_schema), + # The manifest's own schema is the data file's write schema. + "schema": json.dumps(_iceberg_schema(write_schema_id, file_has_data_column)), "partition-spec": "[]", }, ) manifest_size = os.path.getsize(manifest_local_path) - # Manifest list. - mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro" - mlist_local_path = os.path.join(meta_local, mlist_filename) - mlist_container_path = f"{container_base}/metadata/{mlist_filename}" - _write_avro( - avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR), - [{ - "manifest_path": manifest_container_path, - "manifest_length": manifest_size, - "partition_spec_id": 0, - "content": 0, # DATA - "sequence_number": seq_number, - "min_sequence_number": seq_number, - "added_snapshot_id": snapshot_id, - "added_files_count": 1, - "existing_files_count": 0, - "deleted_files_count": 0, - "added_rows_count": 3, - "existing_rows_count": 0, - "deleted_rows_count": 0, - }], - mlist_local_path, - ) + def _write_manifest_list(snapshot_id): + filename = f"snap-{snapshot_id}-0-manifest-list.avro" + local_path = os.path.join(meta_local, filename) + container_path = f"{container_base}/metadata/{filename}" + _write_avro( + _MANIFEST_LIST_SCHEMA_STR, + [{ + "manifest_path": manifest_container_path, + "manifest_length": manifest_size, + "partition_spec_id": 0, + "content": 0, # DATA + "sequence_number": 1, + "min_sequence_number": 1, + "added_snapshot_id": entry_snapshot_id, + "added_files_count": 1, + "existing_files_count": 0, + "deleted_files_count": 0, + "added_rows_count": len(_IDS), + "existing_rows_count": 0, + "deleted_rows_count": 0, + }], + local_path, + ) + return container_path + + if schema_evolution: + # Two schemas, two snapshots: the data file was written under schema 0 + # (id only); the table later evolved to schema 1 (id + data). + schemas = [_iceberg_schema(0, with_data=False), _iceberg_schema(1, with_data=True)] + current_schema = 1 + last_column_id = 2 + snapshots = [ + {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)}, + {"snapshot-id": 2, "schema-id": 1, "manifest-list": _write_manifest_list(2)}, + ] + current_snapshot = 2 + last_seq = 2 + else: + # Single schema (id + data); the data file resolves to it directly. + schemas = [_iceberg_schema(0, with_data=True)] + current_schema = 0 + last_column_id = 2 + snapshots = [ + {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)}, + ] + current_snapshot = 1 + last_seq = 1 - # Table metadata JSON. metadata = { "format-version": 2, "table-uuid": str(uuid.uuid4()), "location": container_base, - "last-sequence-number": seq_number, + "last-sequence-number": last_seq, "last-updated-ms": ts_ms, - "last-column-id": 2, - "current-schema-id": 0, - "schemas": [{ - "type": "struct", - "schema-id": 0, - "fields": [ - {"id": 1, "name": "id", "required": False, "type": "int"}, - {"id": 2, "name": "data", "required": False, "type": "string"}, - ], - }], + "last-column-id": last_column_id, + "current-schema-id": current_schema, + "schemas": schemas, "default-spec-id": 0, "partition-specs": [{"spec-id": 0, "fields": []}], "last-partition-id": 999, "default-sort-order-id": 0, "sort-orders": [{"order-id": 0, "fields": []}], "properties": {}, - "current-snapshot-id": snapshot_id, - "snapshots": [{ - "snapshot-id": snapshot_id, - "sequence-number": seq_number, - "timestamp-ms": ts_ms, - "manifest-list": mlist_container_path, - "summary": {"operation": "append"}, - "schema-id": 0, - }], - "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}], + "current-snapshot-id": current_snapshot, + "snapshots": [ + { + "snapshot-id": s["snapshot-id"], + "sequence-number": s["snapshot-id"], + "timestamp-ms": ts_ms, + "manifest-list": s["manifest-list"], + "summary": {"operation": "append"}, + "schema-id": s["schema-id"], + } + for s in snapshots + ], + "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": s["snapshot-id"]} for s in snapshots], "metadata-log": [], - "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}}, + "refs": {"main": {"snapshot-id": current_snapshot, "type": "branch"}}, } with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f: json.dump(metadata, f, indent=2) @@ -257,7 +348,7 @@ def _create_stats_less_iceberg_table(tmpdir, table_name, container_base): return table_local -def _upload_to_container(instance, local_table_dir, container_base): +def _upload_to_node(instance, local_table_dir, container_base): uploader = LocalUploader(instance) for root, _dirs, files in os.walk(local_table_dir): for fname in files: @@ -266,420 +357,194 @@ def _upload_to_container(instance, local_table_dir, container_base): uploader.upload_file(local_path, os.path.join(container_base, rel)) -# Manifest entry schema with value_counts only (partial stats). -# column_sizes and null_value_counts are absent. value_counts alone is enough to -# populate columns_info so the absent-NULL guard (columns_info.empty()) is not triggered. -_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = json.dumps({ - "type": "record", - "name": "manifest_entry", - "fields": [ - {"name": "status", "type": "int"}, - {"name": "snapshot_id", "type": ["null", "long"]}, - {"name": "sequence_number", "type": ["null", "long"]}, - {"name": "file_sequence_number", "type": ["null", "long"]}, - { - "name": "data_file", - "type": { - "type": "record", - "name": "r2", - "fields": [ - {"name": "content", "type": "int"}, - {"name": "file_path", "type": "string"}, - {"name": "file_format", "type": "string"}, - { - "name": "partition", - "type": {"type": "record", "name": "r102", "fields": []}, - }, - {"name": "record_count", "type": "long"}, - {"name": "file_size_in_bytes", "type": "long"}, - # Only value_counts is present (column_sizes and null_value_counts absent). - { - "name": "value_counts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "k81v81", - "fields": [ - {"name": "key", "type": "int"}, - {"name": "value", "type": "long"}, - ], - }, - }, - }, - ], - }, - }, - ], -}) - -# Manifest entry schema with all three stats fields (value_counts, column_sizes, -# null_value_counts). Represents a manifest written by a full-stats writer such as Spark. -_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = json.dumps({ - "type": "record", - "name": "manifest_entry", - "fields": [ - {"name": "status", "type": "int"}, - {"name": "snapshot_id", "type": ["null", "long"]}, - {"name": "sequence_number", "type": ["null", "long"]}, - {"name": "file_sequence_number", "type": ["null", "long"]}, - { - "name": "data_file", - "type": { - "type": "record", - "name": "r2", - "fields": [ - {"name": "content", "type": "int"}, - {"name": "file_path", "type": "string"}, - {"name": "file_format", "type": "string"}, - { - "name": "partition", - "type": {"type": "record", "name": "r102", "fields": []}, - }, - {"name": "record_count", "type": "long"}, - {"name": "file_size_in_bytes", "type": "long"}, - { - "name": "column_sizes", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "k81v81", - "fields": [ - {"name": "key", "type": "int"}, - {"name": "value", "type": "long"}, - ], - }, - }, - }, - { - "name": "value_counts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "k81v81_vc", - "fields": [ - {"name": "key", "type": "int"}, - {"name": "value", "type": "long"}, - ], - }, - }, - }, - { - "name": "null_value_counts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "k81v81_nc", - "fields": [ - {"name": "key", "type": "int"}, - {"name": "value", "type": "long"}, - ], - }, - }, - }, - ], - }, - }, - ], -}) +def _container_base(table_name): + return f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" -def _create_iceberg_table_with_schema(tmpdir, table_name, container_base, - manifest_schema_str, manifest_extra_fields=None): - """ - Build a minimal Iceberg v2 table. manifest_extra_fields is added to each - manifest entry's data_file record when using schemas that include stats. - """ - table_local = os.path.join(tmpdir, table_name) - data_local = os.path.join(table_local, "data") - meta_local = os.path.join(table_local, "metadata") - os.makedirs(data_local) - os.makedirs(meta_local) - - arrow_schema = pa.schema([ - pa.field("id", pa.int32(), nullable=True), - pa.field("data", pa.string(), nullable=True), - ]) - pq.write_table( - pa.table( - { - "id": pa.array([1, 2, 3], type=pa.int32()), - "data": pa.array(["hello", "world", "iceberg"], type=pa.string()), - }, - schema=arrow_schema, - ), - os.path.join(data_local, "00000-0-data.parquet"), - ) - data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet")) - data_container_path = f"{container_base}/data/00000-0-data.parquet" +def _query_local(instance, container_base, optimization=1): + return instance.query( + f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" + " ORDER BY id" + f" SETTINGS allow_experimental_iceberg_read_optimization={optimization}" + ).strip() - snapshot_id = 1 - seq_number = 1 - ts_ms = int(time.time() * 1000) - iceberg_schema = { - "type": "struct", - "schema-id": 0, - "fields": [ - {"id": 1, "name": "id", "required": False, "type": "int"}, - {"id": 2, "name": "data", "required": False, "type": "string"}, - ], - } +def test_stats_less_manifest_returns_real_data(started_cluster_iceberg_no_spark): + """ + Discriminating reproducer for Altinity#1545. - data_file_record = { - "content": 0, - "file_path": data_container_path, - "file_format": "PARQUET", - "partition": {}, - "record_count": 3, - "file_size_in_bytes": data_size, - } - if manifest_extra_fields: - data_file_record.update(manifest_extra_fields) + A stats-less manifest (empty columns_info) must read real data, not all-NULL + rows. Unpatched build: every nullable column is injected as a constant NULL + and the result is 3 all-NULL rows. Fixed build: the empty-stats guard falls + through to the Parquet reader and returns the real values. + """ + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_stats_less_real_" + get_uuid_str() + container_base = _container_base(table_name) - manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro") - manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro" - _write_avro( - avro.schema.parse(manifest_schema_str), - [{ - "status": 1, - "snapshot_id": snapshot_id, - "sequence_number": seq_number, - "file_sequence_number": seq_number, - "data_file": data_file_record, - }], - manifest_local_path, - metadata={ - "schema": json.dumps(iceberg_schema), - "partition-spec": "[]", - }, - ) - manifest_size = os.path.getsize(manifest_local_path) + with tempfile.TemporaryDirectory() as tmpdir: + local_dir = _create_iceberg_table( + tmpdir, table_name, container_base, + manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR, + ) + _upload_to_node(instance, local_dir, container_base) - mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro" - mlist_local_path = os.path.join(meta_local, mlist_filename) - mlist_container_path = f"{container_base}/metadata/{mlist_filename}" - _write_avro( - avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR), - [{ - "manifest_path": manifest_container_path, - "manifest_length": manifest_size, - "partition_spec_id": 0, - "content": 0, - "sequence_number": seq_number, - "min_sequence_number": seq_number, - "added_snapshot_id": snapshot_id, - "added_files_count": 1, - "existing_files_count": 0, - "deleted_files_count": 0, - "added_rows_count": 3, - "existing_rows_count": 0, - "deleted_rows_count": 0, - }], - mlist_local_path, + result = _query_local(instance, container_base) + assert result == _REAL_DATA_RESULT, ( + f"Got: {result!r}\n" + "All-NULL rows mean the bug fired: empty columns_info caused the " + "optimization to treat every nullable column as absent and inject " + "constant NULL (need_only_count=1)." ) - metadata = { - "format-version": 2, - "table-uuid": str(uuid.uuid4()), - "location": container_base, - "last-sequence-number": seq_number, - "last-updated-ms": ts_ms, - "last-column-id": 2, - "current-schema-id": 0, - "schemas": [{ - "type": "struct", - "schema-id": 0, - "fields": [ - {"id": 1, "name": "id", "required": False, "type": "int"}, - {"id": 2, "name": "data", "required": False, "type": "string"}, - ], - }], - "default-spec-id": 0, - "partition-specs": [{"spec-id": 0, "fields": []}], - "last-partition-id": 999, - "default-sort-order-id": 0, - "sort-orders": [{"order-id": 0, "fields": []}], - "properties": {}, - "current-snapshot-id": snapshot_id, - "snapshots": [{ - "snapshot-id": snapshot_id, - "sequence-number": seq_number, - "timestamp-ms": ts_ms, - "manifest-list": mlist_container_path, - "summary": {"operation": "append"}, - "schema-id": 0, - }], - "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}], - "metadata-log": [], - "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}}, - } - with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f: - json.dump(metadata, f, indent=2) - - return table_local - -def test_iceberg_local_returns_actual_rows_with_stats_less_manifest( - started_cluster_iceberg_no_spark, -): +def test_stats_less_manifest_schema_evolution_absent_column(started_cluster_iceberg_no_spark): """ - FAILS on unpatched code (Altinity#1545). + Discriminating test for the schema-evolution fall-through (highest risk). + + The Iceberg schema declares ``id``+``data`` but the Parquet file was written + under the older schema and physically contains only ``id``. With the + stats-less manifest, the fixed build must fall through to the Parquet reader, + read ``id`` for real, and synthesize the absent ``data`` column as NULL via + IcebergMetadata::getInitialSchemaByPath (file schema as initial_header). - With allow_experimental_iceberg_read_optimization=1 (the default), reading an - icebergLocal table whose manifest omits column statistics returns all-NULL rows - instead of real data. The correct output is 3 rows: (1,'hello'), (2,'world'), - (3,'iceberg'). + Unpatched build: the optimization marks BOTH columns (including ``id``) + constant NULL, so ``id`` comes back NULL too -- the discriminator. """ - instance = started_cluster_iceberg_no_spark.instances["node1"] - table_name = "test_opt_on_stats_less_" + get_uuid_str() - container_base = ( - f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" - ) + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_stats_less_evolve_" + get_uuid_str() + container_base = _container_base(table_name) with tempfile.TemporaryDirectory() as tmpdir: - local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base) - _upload_to_container(instance, local_dir, container_base) - - result = instance.query( - f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" - " ORDER BY id" - " SETTINGS allow_experimental_iceberg_read_optimization=1" - ).strip() + local_dir = _create_iceberg_table( + tmpdir, table_name, container_base, + manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR, + file_has_data_column=False, + schema_evolution=True, + ) + _upload_to_node(instance, local_dir, container_base) - assert result == "1\thello\n2\tworld\n3\ticeberg", ( + result = _query_local(instance, container_base) + assert result == _DATA_NULL_RESULT, ( f"Got: {result!r}\n" - "All-NULL rows means the bug fired: empty columns_infos caused the " - "optimization to treat every nullable column as an absent schema-evolution " - "column and inject constant NULL (need_only_count=1)." + "Expected real 'id' with NULL 'data'. If 'id' is also NULL the bug " + "fired; if the query errors, the Parquet fall-through did not synthesize " + "the schema-evolved-absent column as NULL." ) -def test_iceberg_local_returns_correct_rows_when_optimization_disabled( - started_cluster_iceberg_no_spark, -): +def test_stats_less_manifest_cluster_returns_real_data(started_cluster_iceberg_no_spark): """ - PASSES on unpatched code. - - With allow_experimental_iceberg_read_optimization=0 the optimization block is - skipped entirely and the Parquet file is read normally. This test is a - regression guard: it must keep passing after the bug is fixed. + Cluster/serialization-path coverage for Altinity#1545. + + icebergLocalCluster distributes file-level tasks to workers, serializing + DataFileMetaInfo (and its empty columns_info) through the cluster JSON path. + An empty columns_info must survive the round-trip and still read real data on + the worker, exercising the same fix on the *Cluster variants. The table is + uploaded to every node because local storage is read by each worker from its + own filesystem. """ - instance = started_cluster_iceberg_no_spark.instances["node1"] - table_name = "test_opt_off_stats_less_" + get_uuid_str() - container_base = ( - f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" - ) + cluster = started_cluster_iceberg_no_spark + instance = cluster.instances["node1"] + table_name = "test_stats_less_cluster_" + get_uuid_str() + container_base = _container_base(table_name) with tempfile.TemporaryDirectory() as tmpdir: - local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base) - _upload_to_container(instance, local_dir, container_base) + local_dir = _create_iceberg_table( + tmpdir, table_name, container_base, + manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR, + ) + for node in cluster.instances.values(): + _upload_to_node(node, local_dir, container_base) result = instance.query( - f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" + f"SELECT * FROM icebergLocalCluster('cluster_simple', local, path='{container_base}', format=Parquet)" " ORDER BY id" - " SETTINGS allow_experimental_iceberg_read_optimization=0" + " SETTINGS allow_experimental_iceberg_read_optimization=1" ).strip() - - assert result == "1\thello\n2\tworld\n3\ticeberg" + assert result == _REAL_DATA_RESULT, ( + f"Got: {result!r}\n" + "All-NULL rows mean empty columns_info was mishandled on the worker after " + "the cluster JSON round-trip." + ) -def test_iceberg_local_partial_stats_manifest_reads_correctly( - started_cluster_iceberg_no_spark, -): +def test_non_empty_stats_absent_column_still_null(started_cluster_iceberg_no_spark): """ - Regression test for Altinity#1545: partial stats (value_counts only). + Non-regression guard: the optimization's legitimate absent-NULL path. - A manifest that includes value_counts but omits column_sizes and - null_value_counts still populates columns_info. The fix must treat the - columns as real (not absent), so real data is returned. + The Iceberg schema declares ``id``+``data`` but the Parquet file contains + only ``id``, and the manifest carries stats for ``id`` only (non-empty + columns_info). The optimization must keep marking the genuinely-absent + ``data`` column NULL while ``id`` reads real values. Passes on both builds; + the fix only changes the empty-columns_info case. """ - instance = started_cluster_iceberg_no_spark.instances["node1"] - table_name = "test_opt_on_partial_stats_" + get_uuid_str() - container_base = ( - f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" - ) + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_nonempty_absent_" + get_uuid_str() + container_base = _container_base(table_name) - # value_counts for field_id 1 (id) and 2 (data): 3 rows each. - extra = { - "value_counts": [ - {"key": 1, "value": 3}, - {"key": 2, "value": 3}, - ], - } + # value_counts for field_id 1 (id) only -> columns_info = {id}. + stats = {"value_counts": [{"key": 1, "value": len(_IDS)}]} with tempfile.TemporaryDirectory() as tmpdir: - local_dir = _create_iceberg_table_with_schema( + local_dir = _create_iceberg_table( tmpdir, table_name, container_base, - _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR, - manifest_extra_fields=extra, + manifest_entry_schema_str=_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR, + file_has_data_column=False, + data_file_stats=stats, ) - _upload_to_container(instance, local_dir, container_base) + _upload_to_node(instance, local_dir, container_base) - result = instance.query( - f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" - " ORDER BY id" - " SETTINGS allow_experimental_iceberg_read_optimization=1" - ).strip() - - assert result == "1\thello\n2\tworld\n3\ticeberg", ( + result = _query_local(instance, container_base) + assert result == _DATA_NULL_RESULT, ( f"Got: {result!r}\n" - "Partial-stats manifest (value_counts only) should not trigger the absent-NULL " - "path because columns_info is non-empty. All-NULL rows means the guard " - "fired incorrectly for partial-stats manifests." + "With non-empty stats the optimization must still read 'id' and inject " + "the absent 'data' column as NULL." ) -def test_iceberg_local_full_stats_manifest_reads_correctly( - started_cluster_iceberg_no_spark, +@pytest.mark.parametrize( + "manifest_entry_schema_str,stats", + [ + ( + _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR, + {"value_counts": [{"key": 1, "value": 3}, {"key": 2, "value": 3}]}, + ), + ( + _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR, + { + "column_sizes": [{"key": 1, "value": 64}, {"key": 2, "value": 128}], + "value_counts": [{"key": 1, "value": 3}, {"key": 2, "value": 3}], + "null_value_counts": [{"key": 1, "value": 0}, {"key": 2, "value": 0}], + }, + ), + ], + ids=["partial_stats", "full_stats"], +) +def test_non_empty_stats_returns_real_data( + started_cluster_iceberg_no_spark, manifest_entry_schema_str, stats ): """ - Control test: full-stats manifest (Spark-like) must continue to return real data. - - A manifest with all three stats fields (column_sizes, value_counts, - null_value_counts) is the common case written by Spark. This test verifies the - optimization does not regress for the normal path after the fix. + Non-regression guard: manifests that do populate columns_info (partial + value_counts-only, and full Spark-style stats) must keep returning real data + when the optimization is enabled. These are the cases the empty-stats guard + must not disturb. """ - instance = started_cluster_iceberg_no_spark.instances["node1"] - table_name = "test_opt_on_full_stats_" + get_uuid_str() - container_base = ( - f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}" - ) - - # Full stats for field_id 1 (id) and 2 (data). - extra = { - "column_sizes": [ - {"key": 1, "value": 64}, - {"key": 2, "value": 128}, - ], - "value_counts": [ - {"key": 1, "value": 3}, - {"key": 2, "value": 3}, - ], - "null_value_counts": [ - {"key": 1, "value": 0}, - {"key": 2, "value": 0}, - ], - } + instance = started_cluster_iceberg_no_spark.instances["node1"] + table_name = "test_nonempty_real_" + get_uuid_str() + container_base = _container_base(table_name) with tempfile.TemporaryDirectory() as tmpdir: - local_dir = _create_iceberg_table_with_schema( + local_dir = _create_iceberg_table( tmpdir, table_name, container_base, - _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR, - manifest_extra_fields=extra, + manifest_entry_schema_str=manifest_entry_schema_str, + data_file_stats=stats, ) - _upload_to_container(instance, local_dir, container_base) - - result = instance.query( - f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)" - " ORDER BY id" - " SETTINGS allow_experimental_iceberg_read_optimization=1" - ).strip() + _upload_to_node(instance, local_dir, container_base) - assert result == "1\thello\n2\tworld\n3\ticeberg", ( + result = _query_local(instance, container_base) + assert result == _REAL_DATA_RESULT, ( f"Got: {result!r}\n" - "Full-stats manifest should return real data when optimization is enabled." + "A manifest with non-empty stats must return real data when the " + "optimization is enabled." ) From 17ef6e71a7bed981355158186229316af81e1361 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 11 May 2026 16:37:40 -0300 Subject: [PATCH 03/33] cast like insert select --- src/Storages/MergeTree/ExportPartTask.cpp | 29 ++ src/Storages/MergeTree/MergeTreeData.cpp | 23 +- src/Storages/StorageReplicatedMergeTree.cpp | 23 +- .../test.py | 263 ++++++++++++++++++ ...rge_tree_part_to_object_storage_simple.sql | 9 +- ...rge_tree_part_to_object_storage_simple.sql | 4 +- 6 files changed, 337 insertions(+), 14 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp index 94b25f63ad9f..f2035577bbf6 100644 --- a/src/Storages/MergeTree/ExportPartTask.cpp +++ b/src/Storages/MergeTree/ExportPartTask.cpp @@ -99,6 +99,31 @@ namespace } } + /// Mirrors `InterpreterInsertQuery::addInsertToSelectPipeline`: positional match, + /// destination header = `getSampleBlockNonMaterialized()`, all type bridging is done + /// by the CAST inside `makeConvertingActions`. No pre-validation, no per-column + /// lossy/non-lossy classification — restrictions are exactly what INSERT SELECT enforces. + void addExportConvertingActions( + QueryPlan & plan_for_part, + const IStorage & destination_storage, + const ContextPtr & local_context) + { + const auto destination_header + = destination_storage.getInMemoryMetadataPtr()->getSampleBlockNonMaterialized(); + + auto dag = ActionsDAG::makeConvertingActions( + plan_for_part.getCurrentHeader()->getColumnsWithTypeAndName(), + destination_header.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Position, + local_context); + + auto expression_step = std::make_unique( + plan_for_part.getCurrentHeader(), + std::move(dag)); + expression_step->setStepDescription("Convert source columns to destination types for export"); + plan_for_part.addStep(std::move(expression_step)); + } + String buildDestinationFilename( const MergeTreePartExportManifest & manifest, const StorageID & storage_id, @@ -261,6 +286,10 @@ bool ExportPartTask::executeStep() /// This is a hack that materializes the columns before the export so they can be exported to tables that have matching columns materializeSpecialColumns(plan_for_part.getCurrentHeader(), metadata_snapshot, local_context, plan_for_part); + /// Align the pipeline header with the destination's non-materialized sample block, + /// using the same `makeConvertingActions(Position)` call INSERT SELECT performs. + addExportConvertingActions(plan_for_part, *destination_storage, local_context); + QueryPlanOptimizationSettings optimization_settings(local_context); auto pipeline_settings = BuildQueryPipelineSettings(local_context); auto builder = plan_for_part.buildQueryPipeline(optimization_settings, pipeline_settings); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a07bd992066e..9dfb4edf20bc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -6750,14 +6751,24 @@ void MergeTreeData::exportPartToTable( #endif } - const auto & source_columns = source_metadata_ptr->getColumns(); + /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` — + /// positional matching with CAST. We surface structural mismatches synchronously at + /// ALTER TABLE time by building (and discarding) the same converting DAG the worker + /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch, + /// untyped cast) fails here; anything it accepts will be CAST at runtime. + { + Block source_sample_block; + for (const auto & column : source_metadata_ptr->getColumns().getReadable()) + source_sample_block.insert({column.type->createColumn(), column.type, column.name}); - const auto & destination_columns = destination_metadata_ptr->getColumns(); + const auto destination_sample_block = destination_metadata_ptr->getSampleBlockNonMaterialized(); - /// compare all source readable columns with all destination insertable columns - /// this allows us to skip ephemeral columns - if (source_columns.getReadable().sizeOfDifference(destination_columns.getInsertable())) - throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); + (void) ActionsDAG::makeConvertingActions( + source_sample_block.getColumnsWithTypeAndName(), + destination_sample_block.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Position, + query_context); + } /// for data lakes this check is performed differently. It is a bit more complex as we need to convert the iceberg partition spec /// to the MergeTree partition spec and compare the two. diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 7aaecac34832..9104884a2a91 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -8387,10 +8388,24 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & auto src_snapshot = getInMemoryMetadataPtr(); auto destination_snapshot = dest_storage->getInMemoryMetadataPtr(); - /// compare all source readable columns with all destination insertable columns - /// this allows us to skip ephemeral columns - if (src_snapshot->getColumns().getReadable().sizeOfDifference(destination_snapshot->getColumns().getInsertable())) - throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); + /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` — + /// positional matching with CAST. We surface structural mismatches synchronously at + /// ALTER TABLE time by building (and discarding) the same converting DAG the worker + /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch, + /// untyped cast) fails here; anything it accepts will be CAST at runtime. + { + Block source_sample_block; + for (const auto & column : src_snapshot->getColumns().getReadable()) + source_sample_block.insert({column.type->createColumn(), column.type, column.name}); + + const auto destination_sample_block = destination_snapshot->getSampleBlockNonMaterialized(); + + (void) ActionsDAG::makeConvertingActions( + source_sample_block.getColumnsWithTypeAndName(), + destination_sample_block.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Position, + query_context); + } /// for data lakes this check is performed later. It is a bit more complex as we need to convert the iceberg partition spec /// to the MergeTree partition spec and compare the two. diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py index b371727a08cd..9dc3b2f8434e 100644 --- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py +++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py @@ -121,6 +121,42 @@ def assert_part_log(node, table: str, part: str) -> None: ) +def wait_for_failed_export_part( + node, + table: str, + part: str, + timeout: int = 60, + poll_interval: float = 0.5, +) -> str: + """Poll system.part_log until a failed ExportPart event appears for *part*. + + Returns the exception text recorded on the log entry, which lets callers + assert on the specific runtime error that propagated from the export worker. + """ + deadline = time.time() + timeout + last_seen = "" + while time.time() < deadline: + node.query("SYSTEM FLUSH LOGS") + row = node.query( + f"SELECT error, exception FROM system.part_log " + f"WHERE event_type = 'ExportPart' " + f"AND database = currentDatabase() " + f"AND table = '{table}' " + f"AND part_name = '{part}' " + f"AND error != 0 " + f"ORDER BY event_time DESC LIMIT 1" + ).strip() + if row: + _error, exception = row.split("\t", 1) + return exception + last_seen = row + time.sleep(poll_interval) + raise TimeoutError( + f"Failed ExportPart event for part {part!r} in table {table!r} " + f"did not appear in system.part_log within {timeout}s (last row: {last_seen!r})" + ) + + # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- @@ -479,3 +515,230 @@ def test_export_part_writes_column_statistics(cluster): node.query(f"DROP TABLE IF EXISTS {mt} SYNC") node.query(f"DROP TABLE IF EXISTS {iceberg}") + + +# --------------------------------------------------------------------------- +# Schema compatibility (INSERT SELECT mirror) +# +# EXPORT PART builds an ActionsDAG via makeConvertingActions(Position) — the +# same primitive that powers INSERT INTO dest SELECT * FROM src. The tests +# below pin that contract: +# +# * Column counts must match positionally; otherwise the ALTER is rejected +# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands. +# * Destination column names need not match source names — positional pairs +# are CAST element-wise. +# * Castable type differences are accepted at validation time (widening and +# even lossy narrowing). Lossiness only matters at runtime: if the value +# does not fit the destination type, the async export worker fails and the +# failure surfaces via system.part_log; the Iceberg table is untouched. +# --------------------------------------------------------------------------- + + +def test_export_part_column_count_mismatch_source_more_is_rejected(cluster): + """ + Source has 3 columns (id, year, extra), destination has 2 (id, year). + The ALTER must be rejected synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH + and the Iceberg table must remain empty. + """ + node = cluster.instances["node1"] + sfx = unique_suffix() + mt = f"mt_count_more_{sfx}" + iceberg = f"iceberg_count_more_{sfx}" + + make_mt(node, mt, "id Int32, year Int32, extra String", "year") + make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year") + + node.query(f"INSERT INTO {mt} VALUES (1, 2020, 'foo'), (2, 2020, 'bar')") + part_2020 = get_part(node, mt, "2020") + + error = node.query_and_get_error( + f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} " + f"SETTINGS allow_experimental_export_merge_tree_part = 1, " + f"allow_experimental_insert_into_iceberg = 1" + ) + assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, ( + f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source>dest column count, " + f"got: {error!r}" + ) + + node.query(f"DROP TABLE IF EXISTS {mt} SYNC") + node.query(f"DROP TABLE IF EXISTS {iceberg}") + + +def test_export_part_column_count_mismatch_source_fewer_is_rejected(cluster): + """ + Source has 2 columns (id, year), destination has 3 (id, year, extra). + Same expected synchronous rejection as the source>dest case. + """ + node = cluster.instances["node1"] + sfx = unique_suffix() + mt = f"mt_count_fewer_{sfx}" + iceberg = f"iceberg_count_fewer_{sfx}" + + make_mt(node, mt, "id Int32, year Int32", "year") + make_iceberg_s3(node, iceberg, "id Int32, year Int32, extra String", "year") + + node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)") + part_2020 = get_part(node, mt, "2020") + + error = node.query_and_get_error( + f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} " + f"SETTINGS allow_experimental_export_merge_tree_part = 1, " + f"allow_experimental_insert_into_iceberg = 1" + ) + assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, ( + f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source Int32 is a + defined conversion) so the ALTER returns synchronously; the actual cast + runs in the async export worker and must fail there. The failure is + propagated to system.part_log (non-zero `error` column with a non-empty + `exception` text) and the Iceberg table is left empty. + + (Integer narrowing overflow is not used here because the internal cast + `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps + on overflow rather than throwing — that would still produce a successful + export with garbage data, not a propagated failure.) + """ + node = cluster.instances["node1"] + sfx = unique_suffix() + mt = f"mt_runtime_cast_fail_{sfx}" + iceberg = f"iceberg_runtime_cast_fail_{sfx}" + + make_mt(node, mt, "id String, year Int32", "year") + make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year") + + node.query(f"INSERT INTO {mt} VALUES ('not a number', 2020)") + part_2020 = get_part(node, mt, "2020") + + export_part(node, mt, part_2020, iceberg) + + exception = wait_for_failed_export_part(node, mt, part_2020) + assert exception, ( + f"Expected non-empty exception text on failed ExportPart entry, got {exception!r}" + ) + + count = int(node.query(f"SELECT count() FROM {iceberg}").strip()) + assert count == 0, ( + f"Expected 0 rows in Iceberg table after failed export, got {count}" + ) + + node.query(f"DROP TABLE IF EXISTS {mt} SYNC") + node.query(f"DROP TABLE IF EXISTS {iceberg}") diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql index 7cb70af024a2..ceb83191f93a 100644 --- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql @@ -9,10 +9,12 @@ CREATE TABLE 03572_mt_table (id UInt64, year UInt16) ENGINE = MergeTree() PARTIT INSERT INTO 03572_mt_table VALUES (1, 2020); -- Create a table with a different partition key and export a partition to it. It should throw +-- on the partition-key AST mismatch (schema compat now follows INSERT SELECT positional semantics, +-- so the column shape matches and the partition-key check is what fires). CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x; ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_invalid_schema_table -SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS} +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} DROP TABLE 03572_invalid_schema_table; @@ -27,13 +29,14 @@ ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE FUNCTION extractKey -- It is a table function, but the engine does not support exports/imports, should throw ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE FUNCTION url('a.parquet'); -- {serverError NOT_IMPLEMENTED} --- Test that destination table can not have a column that matches the source ephemeral +-- Source-side ephemeral columns are not readable, so the destination must not declare a matching +-- ordinary column or the column count will not align under positional matching. CREATE TABLE 03572_ephemeral_mt_table (id UInt64, year UInt16, name String EPHEMERAL) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_matching_ephemeral_s3_table (id UInt64, year UInt16, name String) ENGINE = S3(s3_conn, filename='03572_matching_ephemeral_s3_table', format='Parquet', partition_strategy='hive') PARTITION BY year; INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice'); -ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError INCOMPATIBLE_COLUMNS} +ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH} DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table; diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql index f8f23532f0a7..0f6647861e43 100644 --- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql @@ -7,10 +7,12 @@ CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTr INSERT INTO 03572_rmt_table VALUES (1, 2020); -- Create a table with a different partition key and export a partition to it. It should throw +-- on the partition-key AST mismatch (schema compat now follows INSERT SELECT positional semantics, +-- so the column shape matches and the partition-key check is what fires). CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x; ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table -SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS} +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} DROP TABLE 03572_invalid_schema_table; From c88f69eb66f5cb5e794995d40e140c903d94da0a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 22 May 2026 18:51:08 -0300 Subject: [PATCH 04/33] tmp --- .../MergeTree/ExportPartitionUtils.cpp | 60 +++++++++- src/Storages/MergeTree/ExportPartitionUtils.h | 35 +++++- src/Storages/MergeTree/MergeTreeData.cpp | 22 +++- src/Storages/StorageReplicatedMergeTree.cpp | 22 +++- .../test.py | 67 +++++++++++ .../test.py | 66 ++++++++++ .../test.py | 113 ++++++++++++++++++ ...rge_tree_part_to_object_storage_simple.sql | 16 ++- ...rge_tree_part_to_object_storage_simple.sql | 14 ++- 9 files changed, 400 insertions(+), 15 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index 4a447c8de3ab..d5df9b6bea2a 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -403,8 +403,18 @@ namespace ExportPartitionUtils #if USE_AVRO void verifyIcebergPartitionCompatibility( const Poco::JSON::Object::Ptr & metadata_object, - const ASTPtr & partition_key_ast) + const ASTPtr & partition_key_ast, + const StorageMetadataPtr & source_metadata, + const StorageMetadataPtr & destination_metadata, + const StorageID & destination_storage_id) { + /// The Iceberg manifest's partition values are derived from the SOURCE + /// part's minmax block (see `IcebergMetadata::commitExportPartitionTransaction`) + /// while the row data is CAST to the destination Iceberg schema by + /// `addExportConvertingActions`. Reject castable type differences on + /// partition-key columns up front so the two cannot disagree. + verifyPartitionColumnsExactTypeMatch(source_metadata, destination_metadata, destination_storage_id); + const auto original_schema_id = metadata_object->getValue(Iceberg::f_current_schema_id); const auto partition_spec_id = metadata_object->getValue(Iceberg::f_default_spec_id); @@ -519,6 +529,54 @@ namespace ExportPartitionUtils } } #endif + + void verifyPartitionColumnsExactTypeMatch( + const StorageMetadataPtr & source_metadata, + const StorageMetadataPtr & destination_metadata, + const StorageID & destination_storage_id) + { + if (!source_metadata->hasPartitionKey()) + return; + + /// Use the partition expression's required columns (the ones that flow + /// into the destination's partition strategy via + /// `minmax_idx->getBlock(storage)`), not the partition expression's + /// sample-block output columns. This catches both identity + /// partitioning (`PARTITION BY year`) and computed partitioning + /// (`PARTITION BY toYearNumSinceEpoch(event_date)` — required column + /// is `event_date`). + const auto required_columns + = source_metadata->getPartitionKey().expression->getRequiredColumnsWithTypes(); + + const auto & destination_columns = destination_metadata->getColumns(); + + for (const auto & source_column : required_columns) + { + if (!destination_columns.has(source_column.name)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot export partition: source partition key references column '{}' " + "which does not exist on the destination table {}. " + "EXPORT does not re-evaluate the partition key against the destination's schema, " + "so every column the source partition expression depends on must be present " + "on the destination with the same type.", + source_column.name, destination_storage_id.getFullTableName()); + + const auto destination_type + = destination_columns.getPhysical(source_column.name).type; + if (!source_column.type->equals(*destination_type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot export partition: partition key column '{}' has type {} on the source " + "but {} on the destination table {}. " + "EXPORT does not re-evaluate the partition key against the destination's schema, " + "so castable type differences are not allowed for partition columns " + "(the partition path and any Iceberg partition values are derived from the " + "source-typed minmax block, while the row data is CAST to the destination type).", + source_column.name, + source_column.type->getName(), + destination_type->getName(), + destination_storage_id.getFullTableName()); + } + } } } diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h index eb67d288d71e..958a8a819457 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.h +++ b/src/Storages/MergeTree/ExportPartitionUtils.h @@ -7,6 +7,7 @@ #include #include #include "Storages/IStorage.h" +#include #include #if USE_AVRO @@ -88,13 +89,41 @@ namespace ExportPartitionUtils const std::string & exception_message, const LoggerPtr & log); + /// EXPORT PART/PARTITION never re-evaluates the partition key against the + /// destination's schema: the partition path (object storage) and the + /// Iceberg partition values (data lake) are derived from the SOURCE part's + /// minmax block in ExportPartTask::executeStep. Therefore every column the + /// SOURCE partition expression depends on must have an identically-typed + /// counterpart on the destination — castable but non-equal types are not + /// safe here even though `makeConvertingActions` would accept them. + /// + /// Called directly from the non-data-lake branch of + /// `MergeTreeData::exportPartToTable` / + /// `StorageReplicatedMergeTree::exportPartitionToTable`, and indirectly + /// for the Iceberg branch via `verifyIcebergPartitionCompatibility` which + /// embeds this check. Throws BAD_ARGUMENTS on missing column or type + /// mismatch. + void verifyPartitionColumnsExactTypeMatch( + const StorageMetadataPtr & source_metadata, + const StorageMetadataPtr & destination_metadata, + const StorageID & destination_storage_id); + #if USE_AVRO /// Verifies that the source MergeTree partition key is compatible with the - /// destination Iceberg partition spec by comparing field source-ids and - /// transforms in order. Throws BAD_ARGUMENTS if they do not match. + /// destination Iceberg partition spec by: + /// * Comparing partition-field source-ids and transforms in order + /// against the Iceberg spec derived from the MergeTree PARTITION BY AST. + /// * Embedding `verifyPartitionColumnsExactTypeMatch` so that columns + /// flowing through the partition key have identical ClickHouse types on + /// both sides — the Iceberg manifest's partition values come from the + /// SOURCE-typed minmax block and cannot be reconciled later. + /// Throws BAD_ARGUMENTS on any mismatch. void verifyIcebergPartitionCompatibility( const Poco::JSON::Object::Ptr & metadata_object, - const ASTPtr & partition_key_ast); + const ASTPtr & partition_key_ast, + const StorageMetadataPtr & source_metadata, + const StorageMetadataPtr & destination_metadata, + const StorageID & destination_storage_id); #endif } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9dfb4edf20bc..ee2b7d2250c8 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6743,7 +6743,12 @@ void MergeTreeData::exportPartToTable( metadata_object->stringify(oss); iceberg_metadata_json = oss.str(); - ExportPartitionUtils::verifyIcebergPartitionCompatibility(metadata_object, source_metadata_ptr->getPartitionKeyAST()); + ExportPartitionUtils::verifyIcebergPartitionCompatibility( + metadata_object, + source_metadata_ptr->getPartitionKeyAST(), + source_metadata_ptr, + destination_metadata_ptr, + dest_storage->getStorageID()); } #else (void)iceberg_metadata_json_; @@ -6770,12 +6775,23 @@ void MergeTreeData::exportPartToTable( query_context); } - /// for data lakes this check is performed differently. It is a bit more complex as we need to convert the iceberg partition spec - /// to the MergeTree partition spec and compare the two. + /// For data lakes (Iceberg) the partition-key checks — both spec/transform + /// compatibility and ClickHouse partition-column exact-type equality — are + /// performed inside `verifyIcebergPartitionCompatibility` above. + /// For non-data-lake destinations we do the AST equivalence check here and + /// then run the partition-column exact-type check directly. if (!dest_storage->isDataLake()) { if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); + + /// EXPORT does not re-evaluate the partition key against the destination's schema: + /// the hive partition path is derived from the SOURCE part's minmax block in + /// `ExportPartTask::executeStep`, while the row data is CAST to destination types + /// by `addExportConvertingActions`. Reject castable type differences on + /// partition-key columns synchronously so the path and the row data cannot disagree. + ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch( + source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID()); } auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9104884a2a91..cb576f137546 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8407,14 +8407,24 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & query_context); } - /// for data lakes this check is performed later. It is a bit more complex as we need to convert the iceberg partition spec - /// to the MergeTree partition spec and compare the two. + /// For data lakes (Iceberg) the partition-key checks — both spec/transform + /// compatibility and ClickHouse partition-column exact-type equality — are + /// performed inside `verifyIcebergPartitionCompatibility` below (per partition). + /// For non-data-lake destinations we do the AST equivalence check here and + /// then run the partition-column exact-type check directly. if (!dest_storage->isDataLake()) { if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); + + /// EXPORT does not re-evaluate the partition key against the destination's schema: + /// the hive partition path is derived from the SOURCE part's minmax block in + /// `ExportPartTask::executeStep`, while the row data is CAST to destination types + /// by `addExportConvertingActions`. Reject castable type differences on + /// partition-key columns synchronously so the path and the row data cannot disagree. + ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch( + src_snapshot, destination_snapshot, dest_storage->getStorageID()); } - zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly(); @@ -8586,7 +8596,11 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & const auto metadata_object = iceberg_metadata->getMetadataJSON(query_context); ExportPartitionUtils::verifyIcebergPartitionCompatibility( - metadata_object, src_snapshot->getPartitionKeyAST()); + metadata_object, + src_snapshot->getPartitionKeyAST(), + src_snapshot, + destination_snapshot, + dest_storage->getStorageID()); std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM oss.exceptions(std::ios::failbit); diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py index 9dc3b2f8434e..43ba24116526 100644 --- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py +++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py @@ -385,6 +385,73 @@ def test_export_part_with_year_transform_partition(cluster): node.query(f"DROP TABLE IF EXISTS {iceberg}") +def test_export_part_partition_column_type_mismatch_is_rejected(cluster): + """ + EXPORT PART must synchronously reject (BAD_ARGUMENTS) when a partition + column has different (even castable) types on source and destination. + + The partition path / Iceberg partition values are computed from the + SOURCE part's minmax block in ExportPartTask::executeStep before + addExportConvertingActions casts the data, so a castable mismatch on + a partition column would let the path/value computation and the row + data disagree. The check in + ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch rejects this + at ALTER time. + + Failing case: source `year Int32` -> destination `year Int64`, both + PARTITION BY year (identity transform). Both Int32 and Int64 are valid + Iceberg primitives, so verifyIcebergPartitionCompatibility happily + accepts the spec — only the exact-type check rejects. + """ + node = cluster.instances["node1"] + sfx = unique_suffix() + mt = f"mt_pcol_type_mismatch_{sfx}" + iceberg = f"iceberg_pcol_type_mismatch_{sfx}" + + make_mt(node, mt, "id Int32, year Int32", "year") + make_iceberg_s3(node, iceberg, "id Int32, year Int64", "year") + + node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)") + + part_2020 = get_part(node, mt, "2020") + + error = node.query_and_get_error( + f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} " + f"SETTINGS allow_experimental_export_merge_tree_part = 1, " + f"allow_experimental_insert_into_iceberg = 1" + ) + assert "BAD_ARGUMENTS" in error, ( + f"Expected BAD_ARGUMENTS for partition-column type mismatch, " + f"got: {error!r}" + ) + assert "partition key column 'year'" in error, ( + f"Expected the error message to identify the offending partition " + f"column 'year', got: {error!r}" + ) + + # No async work scheduled: no ExportPart row in system.part_log. + node.query("SYSTEM FLUSH LOGS") + part_log_count = node.query( + f"SELECT count() FROM system.part_log " + f"WHERE event_type = 'ExportPart' " + f" AND database = currentDatabase() " + f" AND table = '{mt}' " + f" AND part_name = '{part_2020}'" + ).strip() + assert part_log_count == "0", ( + f"Expected no ExportPart entries in system.part_log after a " + f"synchronously-rejected export, found {part_log_count}." + ) + + count = int(node.query(f"SELECT count() FROM {iceberg}").strip()) + assert count == 0, ( + f"Expected 0 rows in Iceberg table after rejected export, got {count}" + ) + + node.query(f"DROP TABLE IF EXISTS {mt} SYNC") + node.query(f"DROP TABLE IF EXISTS {iceberg}") + + def test_export_part_partition_key_mismatch_is_rejected(cluster): """ EXPORT PART must synchronously reject (BAD_ARGUMENTS) when the source diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 8b49589a3005..e6e5127bdd72 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -746,6 +746,72 @@ def test_partition_key_compatibility_check(cluster): ) +def test_export_partition_partition_column_type_mismatch_is_rejected(cluster): + """ + EXPORT PARTITION must synchronously reject (BAD_ARGUMENTS) when a + partition column has a different (even castable) type on the source + MergeTree and the destination Iceberg table. + + The Iceberg destination's partition path is derived from the SOURCE + part's minmax block in ExportPartTask::executeStep, and the Iceberg + manifest's partition values come from the SOURCE-typed + `manifest.data_part->partition.value` Fields. Letting a castable type + mismatch through would mean the manifest/path computation and the row + data (which IS cast by addExportConvertingActions) disagree — not + equivalent to INSERT SELECT. + + Failing case: source `year Int32` -> destination `year Int64`, both + PARTITION BY year (identity transform). Both are valid Iceberg primitives + so verifyIcebergPartitionCompatibility accepts the spec; only the new + exact-type check rejects. + """ + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_pcol_type_mismatch_{uid}" + iceberg_table = f"iceberg_pcol_type_mismatch_{uid}" + + # Source uses year Int32; destination Iceberg uses year Int64. + make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)") + + make_iceberg_s3( + node, iceberg_table, "id Int64, year Int64", + partition_by="year", s3_retry_attempts=3, + ) + + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}" + ) + assert "BAD_ARGUMENTS" in error, ( + f"Expected BAD_ARGUMENTS for partition-column type mismatch, " + f"got: {error!r}" + ) + assert "partition key column 'year'" in error, ( + f"Expected the error message to identify the offending partition " + f"column 'year', got: {error!r}" + ) + + # No async work scheduled. + rows_in_system_view = node.query( + f"SELECT count() FROM system.replicated_partition_exports " + f"WHERE source_table = '{mt_table}' " + f" AND destination_table = '{iceberg_table}' " + f" AND partition_id = '2020'" + ).strip() + assert rows_in_system_view == "0", ( + f"Expected no row in system.replicated_partition_exports after a " + f"synchronously-rejected export, got {rows_in_system_view}." + ) + + # Destination must remain empty. + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 0, ( + f"Expected 0 rows in Iceberg destination after rejected export, " + f"got {count}" + ) + + def test_export_ttl(cluster): """ After a manifest TTL expires the same partition can be re-exported, and the diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py index 3161e3b67100..5c3e656107bc 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py @@ -1532,6 +1532,119 @@ def test_export_partition_all(cluster): assert row_count == 3, f"Expected 3 rows in S3 after EXPORT PARTITION ALL, got {row_count}" +def test_export_partition_partition_column_castable_type_mismatch(cluster): + """ + Post-fix contract: EXPORT PARTITION must reject castable but non-equal + partition-column types synchronously. + + Setup + ----- + * Source (ReplicatedMergeTree): ``id UInt64, year String``, + ``PARTITION BY year`` + * Destination (S3, partition_strategy='hive'): ``id UInt64, + year UInt16``, ``PARTITION BY year`` + + Why the synchronous rejection is required + ----------------------------------------- + ``makeConvertingActions(Position)`` accepts ``String -> UInt16`` and the + PARTITION BY ASTs are textually identical, so the schema/AST validators + would let this through. But EXPORT does not re-evaluate the partition + key against the destination's schema: in ``ExportPartTask::executeStep`` + the block fed to ``destination_storage->import`` is built from the + source part's minmax index (SOURCE types). If we let the export proceed + the hive partition path is computed from the un-cast SOURCE value while + the row data is the CAST value — silent divergence from INSERT SELECT. + The exact-type partition-column check in + ``ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch`` rejects + this pairing at ALTER time. + + Assertions + ---------- + * ``ALTER ... EXPORT PARTITION ID '' TO TABLE ...`` raises a + ``BAD_ARGUMENTS`` exception whose text references the partition + column type mismatch. + * No row is created in ``system.replicated_partition_exports`` for + this (source_table, destination_table, partition_id) — nothing + was ever scheduled. + * No parquet file is written under any ``year=*/`` prefix in S3. + """ + skip_if_remote_database_disk_enabled(cluster) + node = cluster.instances["replica1"] + + postfix = str(uuid.uuid4()).replace("-", "_") + mt_table = f"pkey_cast_mismatch_partition_mt_{postfix}" + s3_table = f"pkey_cast_mismatch_partition_s3_{postfix}" + + # Source: year String; destination: year UInt16. PARTITION BY year on + # both sides — same AST text — to defeat the AST equivalence check. + node.query( + f"CREATE TABLE {mt_table} (id UInt64, year String) " + f"ENGINE = ReplicatedMergeTree('/clickhouse/tables/{mt_table}', 'replica1') " + f"PARTITION BY year " + f"ORDER BY tuple()" + ) + node.query( + f"CREATE TABLE {s3_table} (id UInt64, year UInt16) " + f"ENGINE = S3(s3_conn, filename='{s3_table}', " + f"format=Parquet, partition_strategy='hive') " + f"PARTITION BY year" + ) + + node.query( + f"INSERT INTO {mt_table} VALUES (1, '2020'), (2, '2020'), (3, '2020')" + ) + + # With a String partition column the partition_id is the SipHash of the + # value rather than the textual representation — look it up so we can + # reference the partition explicitly in EXPORT PARTITION ID and in + # subsequent system.replicated_partition_exports queries. + partition_id = node.query( + f"SELECT partition_id FROM system.parts " + f"WHERE database = currentDatabase() AND table = '{mt_table}' " + f" AND active " + f"ORDER BY name LIMIT 1" + ).strip() + assert partition_id, ( + "Expected one active part on the source table after INSERT; " + "system.parts returned nothing." + ) + + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{partition_id}' " + f"TO TABLE {s3_table}" + ) + assert "BAD_ARGUMENTS" in error, ( + f"Expected BAD_ARGUMENTS for partition-column type mismatch, " + f"got: {error!r}" + ) + assert "partition key column 'year'" in error, ( + f"Expected the error message to identify the offending partition " + f"column 'year', got: {error!r}" + ) + + # Nothing scheduled: no row in system.replicated_partition_exports. + rows_in_system_view = node.query( + f"SELECT count() FROM system.replicated_partition_exports " + f"WHERE source_table = '{mt_table}' " + f" AND destination_table = '{s3_table}' " + f" AND partition_id = '{partition_id}'" + ).strip() + assert rows_in_system_view == "0", ( + f"Expected no row in system.replicated_partition_exports after a " + f"synchronously-rejected export, got {rows_in_system_view}." + ) + + # Nothing written: no parquet file under any year=*/ partition prefix. + files_in_s3 = node.query( + f"SELECT count() FROM s3(s3_conn, " + f"filename='{s3_table}/year=*/*.parquet', format='One')" + ).strip() + assert files_in_s3 == "0", ( + f"Expected no Parquet files in S3 after a synchronously-rejected " + f"export, found {files_in_s3}." + ) + + def test_export_partition_all_failure_modes(cluster): """Cover the three values of `export_merge_tree_partition_all_on_error`. diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql index ceb83191f93a..9357f4e5aa78 100644 --- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql @@ -1,6 +1,6 @@ -- Tags: no-parallel, no-fasttest -DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table; +DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3; SET allow_experimental_export_merge_tree_part=1; @@ -39,4 +39,16 @@ INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice'); ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH} -DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table; +-- Castable but non-equal partition-column types must be rejected synchronously. +-- The partition path is computed from the source part's minmax block (SOURCE types) +-- BEFORE row data is CAST to destination types — letting String -> UInt16 through +-- would cause silent divergence between partition path and row values. +CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); +CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; + +-- Partition-column type mismatch is checked synchronously BEFORE the part lookup, +-- so we don't need to insert data nor use a real part name to provoke it. +ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} + +DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3; diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql index 0f6647861e43..874b5f288cf7 100644 --- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql @@ -1,6 +1,6 @@ -- Tags: no-parallel, no-fasttest -DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table; +DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3; CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple(); @@ -21,4 +21,14 @@ CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_ ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED} -DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table; +-- Castable but non-equal partition-column types must be rejected synchronously. +-- The partition path is computed from the source part's minmax block (SOURCE types) +-- BEFORE row data is CAST to destination types — letting String -> UInt16 through +-- would cause silent divergence between partition path and row values. +CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple(); +CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; + +ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} + +DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3; From 67489f2e7f84ca9e2a124937c27859429a0aa2e6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 8 Jun 2026 10:26:33 -0300 Subject: [PATCH 05/33] fix build and tests --- .../MergeTree/ExportPartitionUtils.cpp | 1 + .../test.py | 264 +++++++++++++++++- 2 files changed, 264 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index d5df9b6bea2a..f227d318a1fd 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #if USE_AVRO #include diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index e6e5127bdd72..45aa565b3586 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -781,7 +781,8 @@ def test_export_partition_partition_column_type_mismatch_is_rejected(cluster): ) error = node.query_and_get_error( - f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}" + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, ) assert "BAD_ARGUMENTS" in error, ( f"Expected BAD_ARGUMENTS for partition-column type mismatch, " @@ -1050,3 +1051,264 @@ def test_export_partition_writes_column_statistics(cluster): entries = fetch_manifest_entries(node, query_id) assert_exported_stats(entries) + + +# --------------------------------------------------------------------------- +# Schema compatibility (INSERT SELECT mirror) +# +# EXPORT PARTITION builds an ActionsDAG via makeConvertingActions(Position) — the +# same primitive that powers INSERT INTO dest SELECT * FROM src. The destination +# compatibility validation in StorageReplicatedMergeTree::exportPartitionToTable +# mirrors what the plain-MergeTree EXPORT PART path does in MergeTreeData, so the +# tests below pin the same contract for the replicated partition export: +# +# * Column counts must match positionally; otherwise the ALTER is rejected +# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled. +# * Destination column names need not match source names — positional pairs +# are CAST element-wise. +# * Castable type differences are accepted at validation time (widening and +# even lossy narrowing). Lossiness only matters at runtime: if the value +# does not fit the destination type, the async export worker fails and the +# export is marked FAILED in system.replicated_partition_exports; the Iceberg +# table is left untouched. +# --------------------------------------------------------------------------- + + +def test_export_partition_column_count_mismatch_source_more_is_rejected(cluster): + """ + Source has 3 columns (id, year, extra), destination has 2 (id, year). + The ALTER must be rejected synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH, + nothing must be scheduled in system.replicated_partition_exports, and the + Iceberg table must remain empty. + """ + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_count_more_{uid}" + iceberg_table = f"iceberg_count_more_{uid}" + + make_rmt(node, mt_table, "id Int64, year Int32, extra String", "year", + replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2020, 'foo'), (2, 2020, 'bar')") + + make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year") + + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, ( + f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source>dest column count, " + f"got: {error!r}" + ) + + rows_in_system_view = node.query( + f"SELECT count() FROM system.replicated_partition_exports " + f"WHERE source_table = '{mt_table}' " + f" AND destination_table = '{iceberg_table}' " + f" AND partition_id = '2020'" + ).strip() + assert rows_in_system_view == "0", ( + f"Expected no row in system.replicated_partition_exports after a " + f"synchronously-rejected export, got {rows_in_system_view}." + ) + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 0, ( + f"Expected 0 rows in Iceberg table after rejected export, got {count}" + ) + + +def test_export_partition_column_count_mismatch_source_fewer_is_rejected(cluster): + """ + Source has 2 columns (id, year), destination has 3 (id, year, extra). + Same expected synchronous rejection as the source>dest case. + """ + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_count_fewer_{uid}" + iceberg_table = f"iceberg_count_fewer_{uid}" + + make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)") + + make_iceberg_s3(node, iceberg_table, "id Int64, year Int32, extra String", + partition_by="year") + + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, ( + f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source Int32 is a + defined conversion) so the ALTER returns synchronously; the actual cast + runs in the async export worker and must fail there. The failure exhausts + the retry budget and the export is marked FAILED in + system.replicated_partition_exports with a non-zero exception_count, while + the Iceberg table is left empty. + + (Integer narrowing overflow is not used here because the internal cast + `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps + on overflow rather than throwing — that would still produce a successful + export with garbage data, not a propagated failure.) + """ + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_runtime_cast_fail_{uid}" + iceberg_table = f"iceberg_runtime_cast_fail_{uid}" + + make_rmt(node, mt_table, "id String, year Int32", "year", replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES ('not a number', 2020)") + + make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year") + + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} " + f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1" + ) + + wait_for_export_status(node, mt_table, iceberg_table, "2020", "FAILED", timeout=60) + + exception_count = int(node.query( + f"SELECT any(exception_count) FROM system.replicated_partition_exports " + f"WHERE source_table = '{mt_table}' " + f" AND destination_table = '{iceberg_table}' " + f" AND partition_id = '2020'" + ).strip()) + assert exception_count > 0, ( + "Expected non-zero exception_count after a failed runtime cast" + ) + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 0, ( + f"Expected 0 rows in Iceberg table after failed export, got {count}" + ) From 245c0bb5a3c887b5d642d298435325be16aa8cfa Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 Jun 2026 14:20:41 -0300 Subject: [PATCH 06/33] yet another impl --- src/Core/Settings.cpp | 3 + src/Core/SettingsChangesHistory.cpp | 1 + .../MergeTree/ExportPartitionUtils.cpp | 89 +++++++++---------- src/Storages/MergeTree/ExportPartitionUtils.h | 40 +++------ src/Storages/MergeTree/MergeTreeData.cpp | 41 ++------- src/Storages/StorageReplicatedMergeTree.cpp | 43 ++------- .../test.py | 61 +++---------- .../test.py | 59 +++--------- .../test.py | 45 ++-------- ...rge_tree_part_to_object_storage_simple.sql | 35 ++++++-- ...rge_tree_part_to_object_storage_simple.sql | 33 +++++-- 11 files changed, 157 insertions(+), 293 deletions(-) diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index ba022db0f83a..6368ec01c1cb 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7605,6 +7605,9 @@ Has no effect on `EXPORT PARTITION ` (single-partition export). )", 0) \ DECLARE(String, export_merge_tree_part_filename_pattern, "{part_name}_{checksum}", R"( Pattern for the filename of the exported merge tree part. The `part_name` and `checksum` are calculated and replaced on the fly. Additional macros are supported. +)", 0) \ + DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"( +Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. )", 0) \ \ /* ####################################################### */ \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 50cc8c572561..3a72ec667415 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -43,6 +43,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() { {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, {"export_merge_tree_partition_task_timeout_seconds", "3600", "86400", "Increase default value to make it more realistic"}, + {"export_merge_tree_part_allow_lossy_cast", false, false, "New setting to gate lossy casts in EXPORT PART/PARTITION behind explicit acknowledgment"}, }); addSettingsChanges(settings_changes_history, "26.3", { diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index f227d318a1fd..ef51b10b9c0a 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -9,6 +9,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -38,6 +42,11 @@ namespace ErrorCodes extern const int NETWORK_ERROR; } +namespace Setting +{ + extern const SettingsBool export_merge_tree_part_allow_lossy_cast; +} + namespace FailPoints { extern const char iceberg_export_after_commit_before_zk_completed[]; @@ -404,18 +413,8 @@ namespace ExportPartitionUtils #if USE_AVRO void verifyIcebergPartitionCompatibility( const Poco::JSON::Object::Ptr & metadata_object, - const ASTPtr & partition_key_ast, - const StorageMetadataPtr & source_metadata, - const StorageMetadataPtr & destination_metadata, - const StorageID & destination_storage_id) + const ASTPtr & partition_key_ast) { - /// The Iceberg manifest's partition values are derived from the SOURCE - /// part's minmax block (see `IcebergMetadata::commitExportPartitionTransaction`) - /// while the row data is CAST to the destination Iceberg schema by - /// `addExportConvertingActions`. Reject castable type differences on - /// partition-key columns up front so the two cannot disagree. - verifyPartitionColumnsExactTypeMatch(source_metadata, destination_metadata, destination_storage_id); - const auto original_schema_id = metadata_object->getValue(Iceberg::f_current_schema_id); const auto partition_spec_id = metadata_object->getValue(Iceberg::f_default_spec_id); @@ -531,51 +530,47 @@ namespace ExportPartitionUtils } #endif - void verifyPartitionColumnsExactTypeMatch( + void verifyExportSchemaCastable( const StorageMetadataPtr & source_metadata, const StorageMetadataPtr & destination_metadata, - const StorageID & destination_storage_id) + const StorageID & destination_storage_id, + const ContextPtr & context) { - if (!source_metadata->hasPartitionKey()) - return; + /// Build (and discard) the same converting DAG the export worker will build + /// later, to surface structural mismatches (column count, untyped casts) early. + Block source_sample_block; + for (const auto & column : source_metadata->getColumns().getReadable()) + source_sample_block.insert({column.type->createColumn(), column.type, column.name}); + + const auto destination_sample_block = destination_metadata->getSampleBlockNonMaterialized(); - /// Use the partition expression's required columns (the ones that flow - /// into the destination's partition strategy via - /// `minmax_idx->getBlock(storage)`), not the partition expression's - /// sample-block output columns. This catches both identity - /// partitioning (`PARTITION BY year`) and computed partitioning - /// (`PARTITION BY toYearNumSinceEpoch(event_date)` — required column - /// is `event_date`). - const auto required_columns - = source_metadata->getPartitionKey().expression->getRequiredColumnsWithTypes(); + const auto source_columns = source_sample_block.getColumnsWithTypeAndName(); + const auto destination_columns = destination_sample_block.getColumnsWithTypeAndName(); - const auto & destination_columns = destination_metadata->getColumns(); + (void) ActionsDAG::makeConvertingActions( + source_columns, + destination_columns, + ActionsDAG::MatchColumnsMode::Position, + context); - for (const auto & source_column : required_columns) + /// Lossy casts may silently change values, so reject them unless the user opts in. + if (context->getSettingsRef()[Setting::export_merge_tree_part_allow_lossy_cast]) + return; + + const size_t num_columns = std::min(source_columns.size(), destination_columns.size()); + for (size_t i = 0; i < num_columns; ++i) { - if (!destination_columns.has(source_column.name)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Cannot export partition: source partition key references column '{}' " - "which does not exist on the destination table {}. " - "EXPORT does not re-evaluate the partition key against the destination's schema, " - "so every column the source partition expression depends on must be present " - "on the destination with the same type.", - source_column.name, destination_storage_id.getFullTableName()); - - const auto destination_type - = destination_columns.getPhysical(source_column.name).type; - if (!source_column.type->equals(*destination_type)) + const auto & source_column = source_columns[i]; + const auto & destination_column = destination_columns[i]; + if (!canBeSafelyCast(source_column.type, destination_column.type)) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Cannot export partition: partition key column '{}' has type {} on the source " - "but {} on the destination table {}. " - "EXPORT does not re-evaluate the partition key against the destination's schema, " - "so castable type differences are not allowed for partition columns " - "(the partition path and any Iceberg partition values are derived from the " - "source-typed minmax block, while the row data is CAST to the destination type).", - source_column.name, + "Cannot export to {}: column '{}' requires a lossy cast from {} to {}, " + "which may change values. Set `export_merge_tree_part_allow_lossy_cast = 1` " + "to allow lossy casts during export.", + destination_storage_id.getFullTableName(), + destination_column.name, source_column.type->getName(), - destination_type->getName(), - destination_storage_id.getFullTableName()); + destination_column.type->getName()); } } } diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h index 958a8a819457..d97bb538a47f 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.h +++ b/src/Storages/MergeTree/ExportPartitionUtils.h @@ -89,41 +89,23 @@ namespace ExportPartitionUtils const std::string & exception_message, const LoggerPtr & log); - /// EXPORT PART/PARTITION never re-evaluates the partition key against the - /// destination's schema: the partition path (object storage) and the - /// Iceberg partition values (data lake) are derived from the SOURCE part's - /// minmax block in ExportPartTask::executeStep. Therefore every column the - /// SOURCE partition expression depends on must have an identically-typed - /// counterpart on the destination — castable but non-equal types are not - /// safe here even though `makeConvertingActions` would accept them. - /// - /// Called directly from the non-data-lake branch of - /// `MergeTreeData::exportPartToTable` / - /// `StorageReplicatedMergeTree::exportPartitionToTable`, and indirectly - /// for the Iceberg branch via `verifyIcebergPartitionCompatibility` which - /// embeds this check. Throws BAD_ARGUMENTS on missing column or type - /// mismatch. - void verifyPartitionColumnsExactTypeMatch( + /// Validates that source columns can be exported into the destination with the + /// same positional CAST matching as `INSERT INTO dest SELECT * FROM src`. Lossy + /// casts are rejected unless `export_merge_tree_part_allow_lossy_cast` is set. + /// Throws BAD_ARGUMENTS on any violation. + void verifyExportSchemaCastable( const StorageMetadataPtr & source_metadata, const StorageMetadataPtr & destination_metadata, - const StorageID & destination_storage_id); + const StorageID & destination_storage_id, + const ContextPtr & context); #if USE_AVRO - /// Verifies that the source MergeTree partition key is compatible with the - /// destination Iceberg partition spec by: - /// * Comparing partition-field source-ids and transforms in order - /// against the Iceberg spec derived from the MergeTree PARTITION BY AST. - /// * Embedding `verifyPartitionColumnsExactTypeMatch` so that columns - /// flowing through the partition key have identical ClickHouse types on - /// both sides — the Iceberg manifest's partition values come from the - /// SOURCE-typed minmax block and cannot be reconciled later. - /// Throws BAD_ARGUMENTS on any mismatch. + /// Verifies the source MergeTree partition key matches the destination Iceberg + /// partition spec (source-ids and transforms in order). Throws BAD_ARGUMENTS on + /// mismatch. void verifyIcebergPartitionCompatibility( const Poco::JSON::Object::Ptr & metadata_object, - const ASTPtr & partition_key_ast, - const StorageMetadataPtr & source_metadata, - const StorageMetadataPtr & destination_metadata, - const StorageID & destination_storage_id); + const ASTPtr & partition_key_ast); #endif } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index ee2b7d2250c8..937c675fab2f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6745,10 +6745,7 @@ void MergeTreeData::exportPartToTable( ExportPartitionUtils::verifyIcebergPartitionCompatibility( metadata_object, - source_metadata_ptr->getPartitionKeyAST(), - source_metadata_ptr, - destination_metadata_ptr, - dest_storage->getStorageID()); + source_metadata_ptr->getPartitionKeyAST()); } #else (void)iceberg_metadata_json_; @@ -6756,42 +6753,16 @@ void MergeTreeData::exportPartToTable( #endif } - /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` — - /// positional matching with CAST. We surface structural mismatches synchronously at - /// ALTER TABLE time by building (and discarding) the same converting DAG the worker - /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch, - /// untyped cast) fails here; anything it accepts will be CAST at runtime. - { - Block source_sample_block; - for (const auto & column : source_metadata_ptr->getColumns().getReadable()) - source_sample_block.insert({column.type->createColumn(), column.type, column.name}); - - const auto destination_sample_block = destination_metadata_ptr->getSampleBlockNonMaterialized(); + /// Positional CAST matching, like `INSERT INTO dest SELECT * FROM src`. + ExportPartitionUtils::verifyExportSchemaCastable( + source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID(), query_context); - (void) ActionsDAG::makeConvertingActions( - source_sample_block.getColumnsWithTypeAndName(), - destination_sample_block.getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Position, - query_context); - } - - /// For data lakes (Iceberg) the partition-key checks — both spec/transform - /// compatibility and ClickHouse partition-column exact-type equality — are - /// performed inside `verifyIcebergPartitionCompatibility` above. - /// For non-data-lake destinations we do the AST equivalence check here and - /// then run the partition-column exact-type check directly. + /// Iceberg partition compatibility is checked above; here we only need the + /// partition-key ASTs to match (partition-column types follow the lossy-cast gate). if (!dest_storage->isDataLake()) { if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); - - /// EXPORT does not re-evaluate the partition key against the destination's schema: - /// the hive partition path is derived from the SOURCE part's minmax block in - /// `ExportPartTask::executeStep`, while the row data is CAST to destination types - /// by `addExportConvertingActions`. Reject castable type differences on - /// partition-key columns synchronously so the path and the row data cannot disagree. - ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch( - source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID()); } auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index cb576f137546..2301ec1c1a30 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8388,42 +8388,16 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & auto src_snapshot = getInMemoryMetadataPtr(); auto destination_snapshot = dest_storage->getInMemoryMetadataPtr(); - /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` — - /// positional matching with CAST. We surface structural mismatches synchronously at - /// ALTER TABLE time by building (and discarding) the same converting DAG the worker - /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch, - /// untyped cast) fails here; anything it accepts will be CAST at runtime. - { - Block source_sample_block; - for (const auto & column : src_snapshot->getColumns().getReadable()) - source_sample_block.insert({column.type->createColumn(), column.type, column.name}); - - const auto destination_sample_block = destination_snapshot->getSampleBlockNonMaterialized(); - - (void) ActionsDAG::makeConvertingActions( - source_sample_block.getColumnsWithTypeAndName(), - destination_sample_block.getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Position, - query_context); - } - - /// For data lakes (Iceberg) the partition-key checks — both spec/transform - /// compatibility and ClickHouse partition-column exact-type equality — are - /// performed inside `verifyIcebergPartitionCompatibility` below (per partition). - /// For non-data-lake destinations we do the AST equivalence check here and - /// then run the partition-column exact-type check directly. + /// Positional CAST matching, like `INSERT INTO dest SELECT * FROM src`. + ExportPartitionUtils::verifyExportSchemaCastable( + src_snapshot, destination_snapshot, dest_storage->getStorageID(), query_context); + + /// Iceberg partition compatibility is checked below; here we only need the + /// partition-key ASTs to match (partition-column types follow the lossy-cast gate). if (!dest_storage->isDataLake()) { if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); - - /// EXPORT does not re-evaluate the partition key against the destination's schema: - /// the hive partition path is derived from the SOURCE part's minmax block in - /// `ExportPartTask::executeStep`, while the row data is CAST to destination types - /// by `addExportConvertingActions`. Reject castable type differences on - /// partition-key columns synchronously so the path and the row data cannot disagree. - ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch( - src_snapshot, destination_snapshot, dest_storage->getStorageID()); } zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly(); @@ -8597,10 +8571,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & ExportPartitionUtils::verifyIcebergPartitionCompatibility( metadata_object, - src_snapshot->getPartitionKeyAST(), - src_snapshot, - destination_snapshot, - dest_storage->getStorageID()); + src_snapshot->getPartitionKeyAST()); std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM oss.exceptions(std::ios::failbit); diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py index 43ba24116526..f51dfa3c602e 100644 --- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py +++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py @@ -385,28 +385,12 @@ def test_export_part_with_year_transform_partition(cluster): node.query(f"DROP TABLE IF EXISTS {iceberg}") -def test_export_part_partition_column_type_mismatch_is_rejected(cluster): - """ - EXPORT PART must synchronously reject (BAD_ARGUMENTS) when a partition - column has different (even castable) types on source and destination. - - The partition path / Iceberg partition values are computed from the - SOURCE part's minmax block in ExportPartTask::executeStep before - addExportConvertingActions casts the data, so a castable mismatch on - a partition column would let the path/value computation and the row - data disagree. The check in - ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch rejects this - at ALTER time. - - Failing case: source `year Int32` -> destination `year Int64`, both - PARTITION BY year (identity transform). Both Int32 and Int64 are valid - Iceberg primitives, so verifyIcebergPartitionCompatibility happily - accepts the spec — only the exact-type check rejects. - """ +def test_export_part_partition_column_lossless_widening(cluster): + """A lossless widening of a partition column (year Int32 -> Int64) round-trips.""" node = cluster.instances["node1"] sfx = unique_suffix() - mt = f"mt_pcol_type_mismatch_{sfx}" - iceberg = f"iceberg_pcol_type_mismatch_{sfx}" + mt = f"mt_pcol_widening_{sfx}" + iceberg = f"iceberg_pcol_widening_{sfx}" make_mt(node, mt, "id Int32, year Int32", "year") make_iceberg_s3(node, iceberg, "id Int32, year Int64", "year") @@ -414,39 +398,20 @@ def test_export_part_partition_column_type_mismatch_is_rejected(cluster): node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)") part_2020 = get_part(node, mt, "2020") + export_part(node, mt, part_2020, iceberg) + wait_for_export_part(node, mt, part_2020) - error = node.query_and_get_error( - f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} " - f"SETTINGS allow_experimental_export_merge_tree_part = 1, " - f"allow_experimental_insert_into_iceberg = 1" - ) - assert "BAD_ARGUMENTS" in error, ( - f"Expected BAD_ARGUMENTS for partition-column type mismatch, " - f"got: {error!r}" - ) - assert "partition key column 'year'" in error, ( - f"Expected the error message to identify the offending partition " - f"column 'year', got: {error!r}" - ) + count = int(node.query(f"SELECT count() FROM {iceberg}").strip()) + assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}" - # No async work scheduled: no ExportPart row in system.part_log. - node.query("SYSTEM FLUSH LOGS") - part_log_count = node.query( - f"SELECT count() FROM system.part_log " - f"WHERE event_type = 'ExportPart' " - f" AND database = currentDatabase() " - f" AND table = '{mt}' " - f" AND part_name = '{part_2020}'" + result = node.query( + f"SELECT id, toTypeName(year), year FROM {iceberg} ORDER BY id" ).strip() - assert part_log_count == "0", ( - f"Expected no ExportPart entries in system.part_log after a " - f"synchronously-rejected export, found {part_log_count}." + assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", ( + f"Unexpected widened partition-column data:\n{result}" ) - count = int(node.query(f"SELECT count() FROM {iceberg}").strip()) - assert count == 0, ( - f"Expected 0 rows in Iceberg table after rejected export, got {count}" - ) + assert_part_log(node, mt, part_2020) node.query(f"DROP TABLE IF EXISTS {mt} SYNC") node.query(f"DROP TABLE IF EXISTS {iceberg}") diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 45aa565b3586..6bd080bc72f0 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -746,30 +746,13 @@ def test_partition_key_compatibility_check(cluster): ) -def test_export_partition_partition_column_type_mismatch_is_rejected(cluster): - """ - EXPORT PARTITION must synchronously reject (BAD_ARGUMENTS) when a - partition column has a different (even castable) type on the source - MergeTree and the destination Iceberg table. - - The Iceberg destination's partition path is derived from the SOURCE - part's minmax block in ExportPartTask::executeStep, and the Iceberg - manifest's partition values come from the SOURCE-typed - `manifest.data_part->partition.value` Fields. Letting a castable type - mismatch through would mean the manifest/path computation and the row - data (which IS cast by addExportConvertingActions) disagree — not - equivalent to INSERT SELECT. - - Failing case: source `year Int32` -> destination `year Int64`, both - PARTITION BY year (identity transform). Both are valid Iceberg primitives - so verifyIcebergPartitionCompatibility accepts the spec; only the new - exact-type check rejects. - """ +def test_export_partition_partition_column_lossless_widening(cluster): + """A lossless widening of a partition column (year Int32 -> Int64) round-trips.""" node = cluster.instances["replica1"] uid = unique_suffix() - mt_table = f"mt_pcol_type_mismatch_{uid}" - iceberg_table = f"iceberg_pcol_type_mismatch_{uid}" + mt_table = f"mt_pcol_widening_{uid}" + iceberg_table = f"iceberg_pcol_widening_{uid}" # Source uses year Int32; destination Iceberg uses year Int64. make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1") @@ -780,36 +763,20 @@ def test_export_partition_partition_column_type_mismatch_is_rejected(cluster): partition_by="year", s3_retry_attempts=3, ) - error = node.query_and_get_error( + node.query( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", settings={"allow_insert_into_iceberg": 1}, ) - assert "BAD_ARGUMENTS" in error, ( - f"Expected BAD_ARGUMENTS for partition-column type mismatch, " - f"got: {error!r}" - ) - assert "partition key column 'year'" in error, ( - f"Expected the error message to identify the offending partition " - f"column 'year', got: {error!r}" - ) - - # No async work scheduled. - rows_in_system_view = node.query( - f"SELECT count() FROM system.replicated_partition_exports " - f"WHERE source_table = '{mt_table}' " - f" AND destination_table = '{iceberg_table}' " - f" AND partition_id = '2020'" - ).strip() - assert rows_in_system_view == "0", ( - f"Expected no row in system.replicated_partition_exports after a " - f"synchronously-rejected export, got {rows_in_system_view}." - ) + wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") - # Destination must remain empty. count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) - assert count == 0, ( - f"Expected 0 rows in Iceberg destination after rejected export, " - f"got {count}" + assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}" + + result = node.query( + f"SELECT id, toTypeName(year), year FROM {iceberg_table} ORDER BY id" + ).strip() + assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", ( + f"Unexpected widened partition-column data:\n{result}" ) diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py index 5c3e656107bc..d403538999df 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py @@ -1533,41 +1533,8 @@ def test_export_partition_all(cluster): def test_export_partition_partition_column_castable_type_mismatch(cluster): - """ - Post-fix contract: EXPORT PARTITION must reject castable but non-equal - partition-column types synchronously. - - Setup - ----- - * Source (ReplicatedMergeTree): ``id UInt64, year String``, - ``PARTITION BY year`` - * Destination (S3, partition_strategy='hive'): ``id UInt64, - year UInt16``, ``PARTITION BY year`` - - Why the synchronous rejection is required - ----------------------------------------- - ``makeConvertingActions(Position)`` accepts ``String -> UInt16`` and the - PARTITION BY ASTs are textually identical, so the schema/AST validators - would let this through. But EXPORT does not re-evaluate the partition - key against the destination's schema: in ``ExportPartTask::executeStep`` - the block fed to ``destination_storage->import`` is built from the - source part's minmax index (SOURCE types). If we let the export proceed - the hive partition path is computed from the un-cast SOURCE value while - the row data is the CAST value — silent divergence from INSERT SELECT. - The exact-type partition-column check in - ``ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch`` rejects - this pairing at ALTER time. - - Assertions - ---------- - * ``ALTER ... EXPORT PARTITION ID '' TO TABLE ...`` raises a - ``BAD_ARGUMENTS`` exception whose text references the partition - column type mismatch. - * No row is created in ``system.replicated_partition_exports`` for - this (source_table, destination_table, partition_id) — nothing - was ever scheduled. - * No parquet file is written under any ``year=*/`` prefix in S3. - """ + """A lossy partition-column cast (year String -> UInt16) is rejected synchronously + when export_merge_tree_part_allow_lossy_cast is off, scheduling nothing.""" skip_if_remote_database_disk_enabled(cluster) node = cluster.instances["replica1"] @@ -1614,12 +1581,12 @@ def test_export_partition_partition_column_castable_type_mismatch(cluster): f"TO TABLE {s3_table}" ) assert "BAD_ARGUMENTS" in error, ( - f"Expected BAD_ARGUMENTS for partition-column type mismatch, " + f"Expected BAD_ARGUMENTS for a lossy partition-column cast, " f"got: {error!r}" ) - assert "partition key column 'year'" in error, ( - f"Expected the error message to identify the offending partition " - f"column 'year', got: {error!r}" + assert "requires a lossy cast" in error and "'year'" in error, ( + f"Expected the error message to report the lossy cast on column " + f"'year', got: {error!r}" ) # Nothing scheduled: no row in system.replicated_partition_exports. diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql index 9357f4e5aa78..e40c5ebde461 100644 --- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql @@ -1,6 +1,6 @@ -- Tags: no-parallel, no-fasttest -DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3; +DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3; SET allow_experimental_export_merge_tree_part=1; @@ -39,16 +39,37 @@ INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice'); ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH} --- Castable but non-equal partition-column types must be rejected synchronously. --- The partition path is computed from the source part's minmax block (SOURCE types) --- BEFORE row data is CAST to destination types — letting String -> UInt16 through --- would cause silent divergence between partition path and row values. +-- Partition columns follow the same lossy-cast gate as any other column (no special +-- exact-type guard). String -> UInt16 is a lossy cast, so with the default +-- export_merge_tree_part_allow_lossy_cast = 0 it is rejected synchronously. CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; --- Partition-column type mismatch is checked synchronously BEFORE the part lookup, +-- The lossy-cast check fires synchronously BEFORE the part lookup, -- so we don't need to insert data nor use a real part name to provoke it. ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} -DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3; +-- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values. +-- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a +-- fake part name is enough to provoke it. +CREATE TABLE 03572_lossy_mt (id Int64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); +CREATE TABLE 03572_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; + +ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} + +-- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation +-- passes and execution proceeds to the part lookup, which fails on the fake part name. +ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART} + +-- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema +-- validation passes and we reach the part lookup, which fails on the fake part name. +CREATE TABLE 03572_lossless_mt (id Int32, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); +CREATE TABLE 03572_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; + +ALTER TABLE 03572_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossless_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART} + +DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3; diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql index 874b5f288cf7..e20fc14a26a5 100644 --- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql @@ -1,6 +1,6 @@ -- Tags: no-parallel, no-fasttest -DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3; +DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3; CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple(); @@ -21,14 +21,35 @@ CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_ ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED} --- Castable but non-equal partition-column types must be rejected synchronously. --- The partition path is computed from the source part's minmax block (SOURCE types) --- BEFORE row data is CAST to destination types — letting String -> UInt16 through --- would cause silent divergence between partition path and row values. +-- Partition columns follow the same lossy-cast gate as any other column (no special +-- exact-type guard). String -> UInt16 is a lossy cast, so with the default +-- export_merge_tree_part_allow_lossy_cast = 0 it is rejected synchronously. CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} -DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3; +-- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values. +-- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a +-- fake part name is enough to provoke it. +CREATE TABLE 03572_rmt_lossy_mt (id Int64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossy/03572_rmt_lossy_mt', 'replica1') PARTITION BY year ORDER BY tuple(); +CREATE TABLE 03572_rmt_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; + +ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} + +-- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation +-- passes and execution proceeds to the part lookup, which fails on the fake part name. +ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART} + +-- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema +-- validation passes and we reach the part lookup, which fails on the fake part name. +CREATE TABLE 03572_rmt_lossless_mt (id Int32, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossless/03572_rmt_lossless_mt', 'replica1') PARTITION BY year ORDER BY tuple(); +CREATE TABLE 03572_rmt_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; + +ALTER TABLE 03572_rmt_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossless_s3 +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART} + +DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3; From 8d2b0862604dd7fca778d8de5a598fbe92d9ce5f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 Jun 2026 14:42:27 -0300 Subject: [PATCH 07/33] fix tests and background setting --- .../MergeTree/ExportPartitionUtils.cpp | 3 ++ .../test.py | 47 +++++++++---------- .../test.py | 41 +++++++--------- 3 files changed, 41 insertions(+), 50 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index ef51b10b9c0a..c902a4042218 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -102,6 +102,9 @@ namespace ExportPartitionUtils /// stalls when the setting is only set at the query level. context_copy->setSetting("allow_insert_into_iceberg", true); + /// This setting, just like allow_insert_into_iceberg, has been verified at request time. At this point, we allow everything. + context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", true); + return context_copy; } diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py index f51dfa3c602e..4a8514ef3ae1 100644 --- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py +++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py @@ -69,11 +69,15 @@ def get_part(node, table: str, partition_id: str) -> str: ).strip() -def export_part(node, table: str, part: str, dest: str) -> None: +def export_part(node, table: str, part: str, dest: str, extra_settings: str = "") -> None: + settings = ( + "allow_experimental_export_merge_tree_part = 1, " + "allow_experimental_insert_into_iceberg = 1" + ) + if extra_settings: + settings += ", " + extra_settings node.query( - f"ALTER TABLE {table} EXPORT PART '{part}' TO TABLE {dest} " - f"SETTINGS allow_experimental_export_merge_tree_part = 1, " - f"allow_experimental_insert_into_iceberg = 1" + f"ALTER TABLE {table} EXPORT PART '{part}' TO TABLE {dest} SETTINGS {settings}" ) @@ -560,10 +564,8 @@ def test_export_part_writes_column_statistics(cluster): # synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands. # * Destination column names need not match source names — positional pairs # are CAST element-wise. -# * Castable type differences are accepted at validation time (widening and -# even lossy narrowing). Lossiness only matters at runtime: if the value -# does not fit the destination type, the async export worker fails and the -# failure surfaces via system.part_log; the Iceberg table is untouched. +# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at +# validation time unless export_merge_tree_part_allow_lossy_cast = 1. # --------------------------------------------------------------------------- @@ -701,10 +703,8 @@ def test_export_part_with_castable_widening(cluster): def test_export_part_with_castable_narrowing_values_fit(cluster): - """ - Source column is Int64, destination expects Int32 — lossy in general but - not blocked, mirroring INSERT SELECT. - """ + """A lossy narrowing (id Int64 -> Int32) succeeds once the user opts in via + export_merge_tree_part_allow_lossy_cast.""" node = cluster.instances["node1"] sfx = unique_suffix() mt = f"mt_narrow_fit_{sfx}" @@ -716,7 +716,7 @@ def test_export_part_with_castable_narrowing_values_fit(cluster): node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)") part_2020 = get_part(node, mt, "2020") - export_part(node, mt, part_2020, iceberg) + export_part(node, mt, part_2020, iceberg, "export_merge_tree_part_allow_lossy_cast = 1") wait_for_export_part(node, mt, part_2020) count = int(node.query(f"SELECT count() FROM {iceberg}").strip()) @@ -736,18 +736,13 @@ def test_export_part_with_castable_narrowing_values_fit(cluster): def test_export_part_runtime_cast_failure_propagates_async(cluster): - """ - Source has a String column whose value cannot be parsed as the destination - Int32 column. Validation accepts the type pair (String -> Int32 is a - defined conversion) so the ALTER returns synchronously; the actual cast - runs in the async export worker and must fail there. The failure is - propagated to system.part_log (non-zero `error` column with a non-empty - `exception` text) and the Iceberg table is left empty. - - (Integer narrowing overflow is not used here because the internal cast - `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps - on overflow rather than throwing — that would still produce a successful - export with garbage data, not a propagated failure.) + """A String value that cannot be parsed as the destination Int32 passes the + synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but + fails at runtime in the async worker; the failure surfaces in system.part_log and + Iceberg is left empty. + + (Integer overflow is not used because the internal cast uses CastType::nonAccurate, + which wraps rather than throwing.) """ node = cluster.instances["node1"] sfx = unique_suffix() @@ -760,7 +755,7 @@ def test_export_part_runtime_cast_failure_propagates_async(cluster): node.query(f"INSERT INTO {mt} VALUES ('not a number', 2020)") part_2020 = get_part(node, mt, "2020") - export_part(node, mt, part_2020, iceberg) + export_part(node, mt, part_2020, iceberg, "export_merge_tree_part_allow_lossy_cast = 1") exception = wait_for_failed_export_part(node, mt, part_2020) assert exception, ( diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 6bd080bc72f0..8e3f055c4317 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -1033,11 +1033,8 @@ def test_export_partition_writes_column_statistics(cluster): # synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled. # * Destination column names need not match source names — positional pairs # are CAST element-wise. -# * Castable type differences are accepted at validation time (widening and -# even lossy narrowing). Lossiness only matters at runtime: if the value -# does not fit the destination type, the async export worker fails and the -# export is marked FAILED in system.replicated_partition_exports; the Iceberg -# table is left untouched. +# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at +# validation time unless export_merge_tree_part_allow_lossy_cast = 1. # --------------------------------------------------------------------------- @@ -1200,10 +1197,8 @@ def test_export_partition_with_castable_widening(cluster): def test_export_partition_with_castable_narrowing_values_fit(cluster): - """ - Source column is Int64, destination expects Int32 — lossy in general but - not blocked, mirroring INSERT SELECT. - """ + """A lossy narrowing (id Int64 -> Int32) succeeds once the user opts in via + export_merge_tree_part_allow_lossy_cast.""" node = cluster.instances["replica1"] uid = unique_suffix() @@ -1217,7 +1212,10 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster): node.query( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", - settings={"allow_insert_into_iceberg": 1}, + settings={ + "allow_insert_into_iceberg": 1, + "export_merge_tree_part_allow_lossy_cast": 1, + }, ) wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") @@ -1233,19 +1231,13 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster): def test_export_partition_runtime_cast_failure_propagates_async(cluster): - """ - Source has a String column whose value cannot be parsed as the destination - Int32 column. Validation accepts the type pair (String -> Int32 is a - defined conversion) so the ALTER returns synchronously; the actual cast - runs in the async export worker and must fail there. The failure exhausts - the retry budget and the export is marked FAILED in - system.replicated_partition_exports with a non-zero exception_count, while - the Iceberg table is left empty. - - (Integer narrowing overflow is not used here because the internal cast - `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps - on overflow rather than throwing — that would still produce a successful - export with garbage data, not a propagated failure.) + """A String value that cannot be parsed as the destination Int32 passes the + synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but + fails at runtime in the async worker, marking the export FAILED and leaving Iceberg + empty. + + (Integer overflow is not used because the internal cast uses CastType::nonAccurate, + which wraps rather than throwing.) """ node = cluster.instances["replica1"] @@ -1260,7 +1252,8 @@ def test_export_partition_runtime_cast_failure_propagates_async(cluster): node.query( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} " - f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1" + f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1, " + f"export_merge_tree_part_allow_lossy_cast = 1" ) wait_for_export_status(node, mt_table, iceberg_table, "2020", "FAILED", timeout=60) From 4100ecd322666697c5c6685ded4e53f1588091f9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 Jun 2026 15:45:22 -0300 Subject: [PATCH 08/33] fix tests --- ...rge_tree_part_to_object_storage_simple.sql | 21 +++++++------------ ...rge_tree_part_to_object_storage_simple.sql | 20 ++++++++---------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql index e40c5ebde461..8200233a7322 100644 --- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql @@ -45,31 +45,26 @@ ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_mat CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; --- The lossy-cast check fires synchronously BEFORE the part lookup, --- so we don't need to insert data nor use a real part name to provoke it. -ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3 +ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_partition_type_mismatch_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} --- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values. --- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a --- fake part name is enough to provoke it. CREATE TABLE 03572_lossy_mt (id Int64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; -ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3 +ALTER TABLE 03572_lossy_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossy_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} --- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation --- passes and execution proceeds to the part lookup, which fails on the fake part name. -ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3 +-- With the acknowledgment setting enabled, the lossy cast passes validation and reaches the +-- part lookup, which fails because the part does not exist. +ALTER TABLE 03572_lossy_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossy_s3 SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART} --- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema --- validation passes and we reach the part lookup, which fails on the fake part name. +-- A lossless widening cast (Int32 -> Int64) passes validation without the setting and reaches +-- the part lookup, which fails because the part does not exist. CREATE TABLE 03572_lossless_mt (id Int32, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; -ALTER TABLE 03572_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossless_s3 +ALTER TABLE 03572_lossless_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossless_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART} DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3; diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql index e20fc14a26a5..0a199755c40a 100644 --- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql @@ -27,29 +27,27 @@ ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_sche CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; -ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3 +ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_partition_type_mismatch_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} --- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values. --- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a --- fake part name is enough to provoke it. +-- A lossy cast on a non-partition column (Int64 -> Int32) is rejected synchronously by default. CREATE TABLE 03572_rmt_lossy_mt (id Int64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossy/03572_rmt_lossy_mt', 'replica1') PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_rmt_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; -ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3 +ALTER TABLE 03572_rmt_lossy_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossy_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS} --- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation --- passes and execution proceeds to the part lookup, which fails on the fake part name. -ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3 +-- With the acknowledgment setting enabled, the lossy cast passes validation and reaches the +-- part lookup, which fails because the part does not exist. +ALTER TABLE 03572_rmt_lossy_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossy_s3 SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART} --- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema --- validation passes and we reach the part lookup, which fails on the fake part name. +-- A lossless widening cast (Int32 -> Int64) passes validation without the setting and reaches +-- the part lookup, which fails because the part does not exist. CREATE TABLE 03572_rmt_lossless_mt (id Int32, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossless/03572_rmt_lossless_mt', 'replica1') PARTITION BY year ORDER BY tuple(); CREATE TABLE 03572_rmt_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year; -ALTER TABLE 03572_rmt_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossless_s3 +ALTER TABLE 03572_rmt_lossless_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossless_s3 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART} DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3; From 914de7283facdfb4c456807b26403ccd76c1a578 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 Jun 2026 17:51:54 -0300 Subject: [PATCH 09/33] some more test fixes --- .../test.py | 16 -- .../test.py | 246 +++++++++++++----- 2 files changed, 185 insertions(+), 77 deletions(-) diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py index 4a8514ef3ae1..486adf1f2b17 100644 --- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py +++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py @@ -553,22 +553,6 @@ def test_export_part_writes_column_statistics(cluster): node.query(f"DROP TABLE IF EXISTS {iceberg}") -# --------------------------------------------------------------------------- -# Schema compatibility (INSERT SELECT mirror) -# -# EXPORT PART builds an ActionsDAG via makeConvertingActions(Position) — the -# same primitive that powers INSERT INTO dest SELECT * FROM src. The tests -# below pin that contract: -# -# * Column counts must match positionally; otherwise the ALTER is rejected -# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands. -# * Destination column names need not match source names — positional pairs -# are CAST element-wise. -# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at -# validation time unless export_merge_tree_part_allow_lossy_cast = 1. -# --------------------------------------------------------------------------- - - def test_export_part_column_count_mismatch_source_more_is_rejected(cluster): """ Source has 3 columns (id, year, extra), destination has 2 (id, year). diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 8e3f055c4317..09cf1c2ff467 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -746,40 +746,6 @@ def test_partition_key_compatibility_check(cluster): ) -def test_export_partition_partition_column_lossless_widening(cluster): - """A lossless widening of a partition column (year Int32 -> Int64) round-trips.""" - node = cluster.instances["replica1"] - - uid = unique_suffix() - mt_table = f"mt_pcol_widening_{uid}" - iceberg_table = f"iceberg_pcol_widening_{uid}" - - # Source uses year Int32; destination Iceberg uses year Int64. - make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1") - node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)") - - make_iceberg_s3( - node, iceberg_table, "id Int64, year Int64", - partition_by="year", s3_retry_attempts=3, - ) - - node.query( - f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", - settings={"allow_insert_into_iceberg": 1}, - ) - wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") - - count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) - assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}" - - result = node.query( - f"SELECT id, toTypeName(year), year FROM {iceberg_table} ORDER BY id" - ).strip() - assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", ( - f"Unexpected widened partition-column data:\n{result}" - ) - - def test_export_ttl(cluster): """ After a manifest TTL expires the same partition can be re-exported, and the @@ -1020,24 +986,6 @@ def test_export_partition_writes_column_statistics(cluster): assert_exported_stats(entries) -# --------------------------------------------------------------------------- -# Schema compatibility (INSERT SELECT mirror) -# -# EXPORT PARTITION builds an ActionsDAG via makeConvertingActions(Position) — the -# same primitive that powers INSERT INTO dest SELECT * FROM src. The destination -# compatibility validation in StorageReplicatedMergeTree::exportPartitionToTable -# mirrors what the plain-MergeTree EXPORT PART path does in MergeTreeData, so the -# tests below pin the same contract for the replicated partition export: -# -# * Column counts must match positionally; otherwise the ALTER is rejected -# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled. -# * Destination column names need not match source names — positional pairs -# are CAST element-wise. -# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at -# validation time unless export_merge_tree_part_allow_lossy_cast = 1. -# --------------------------------------------------------------------------- - - def test_export_partition_column_count_mismatch_source_more_is_rejected(cluster): """ Source has 3 columns (id, year, extra), destination has 2 (id, year). @@ -1162,12 +1110,8 @@ def test_export_partition_with_renamed_destination_column(cluster): def test_export_partition_with_castable_widening(cluster): - """ - Source column is Int32, destination expects Int64. makeConvertingActions - inserts a widening CAST; the export must succeed and the values must land - as Int64 in Iceberg. The partition column `year` keeps the same Int32 type - on both sides so the exact-type partition check is satisfied. - """ + """A lossless widening of both a data column (id Int32 -> Int64) and the + partition column (year Int32 -> Int64) round-trips.""" node = cluster.instances["replica1"] uid = unique_suffix() @@ -1177,7 +1121,7 @@ def test_export_partition_with_castable_widening(cluster): make_rmt(node, mt_table, "id Int32, year Int32", "year", replica_name="replica1") node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)") - make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year") + make_iceberg_s3(node, iceberg_table, "id Int64, year Int64", partition_by="year") node.query( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", @@ -1189,9 +1133,9 @@ def test_export_partition_with_castable_widening(cluster): assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}" result = node.query( - f"SELECT id, toTypeName(id), year FROM {iceberg_table} ORDER BY id" + f"SELECT id, toTypeName(id), year, toTypeName(year) FROM {iceberg_table} ORDER BY id" ).strip() - assert result == "1\tInt64\t2020\n2\tInt64\t2020", ( + assert result == "1\tInt64\t2020\tInt64\n2\tInt64\t2020\tInt64", ( f"Unexpected widened data:\n{result}" ) @@ -1230,6 +1174,31 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster): ) +def test_export_partition_lossy_cast_rejected_without_optin(cluster): + """A lossy narrowing (id Int64 -> Int32) is rejected synchronously with + BAD_ARGUMENTS unless export_merge_tree_part_allow_lossy_cast is set.""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_lossy_reject_{uid}" + iceberg_table = f"iceberg_lossy_reject_{uid}" + + make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2020)") + + make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year") + + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} " + f"SETTINGS allow_insert_into_iceberg = 1" + ) + assert "BAD_ARGUMENTS" in error, f"Expected BAD_ARGUMENTS, got: {error!r}" + assert "lossy cast" in error, f"Expected 'lossy cast' in error, got: {error!r}" + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 0, f"Expected no rows after a rejected export, got {count}" + + def test_export_partition_runtime_cast_failure_propagates_async(cluster): """A String value that cannot be parsed as the destination Int32 passes the synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but @@ -1272,3 +1241,158 @@ def test_export_partition_runtime_cast_failure_propagates_async(cluster): assert count == 0, ( f"Expected 0 rows in Iceberg table after failed export, got {count}" ) + + +def test_export_partition_all_iceberg_types(cluster): + """Every getIcebergType-supported type round-trips through an EXPORT PARTITION: + scalars use narrower source types (explicit lossless widening CASTs), plus + Array/Map/Tuple nested columns.""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_all_types_{uid}" + iceberg_table = f"iceberg_all_types_{uid}" + + # Scalar source types are strictly narrower than the destination; the export inserts + # a positional widening CAST per column (Int8->Int16, UInt32->UInt64, ...). Nested + # columns keep the same type on both sides. + source_columns = ( + "i16 Int8, u16 UInt8, u32 UInt16, u64 UInt32, " + "id Int16, big Int32, f32 Float32, f64 Float64, " + "d Date, d32 Date32, dt DateTime, dt64 DateTime64(6), " + "s String, uid UUID, " + "arr Array(Int32), m Map(String, Int64), tup Tuple(a Int32, b String), " + "year Int32" + ) + dest_columns = ( + "i16 Int16, u16 UInt16, u32 UInt32, u64 UInt64, " + "id Int32, big Int64, f32 Float32, f64 Float64, " + "d Date, d32 Date32, dt DateTime, dt64 DateTime64(6), " + "s String, uid UUID, " + "arr Array(Int32), m Map(String, Int64), tup Tuple(a Int32, b String), " + "year Int32" + ) + + make_rmt(node, mt_table, source_columns, "year", replica_name="replica1") + make_iceberg_s3(node, iceberg_table, dest_columns, partition_by="year") + + node.query( + f""" + INSERT INTO {mt_table} + (i16, u16, u32, u64, id, big, f32, f64, d, d32, dt, dt64, s, uid, arr, m, tup, year) + VALUES ( + -100, 200, 50000, 4000000000, + 12345, 1000000000, 3.14, 2.718281828459045, + '2024-01-15', '2024-01-15', '2024-01-15 12:30:45', '2024-01-15 12:30:45.123456', + 'hello iceberg', '550e8400-e29b-41d4-a716-446655440000', + [1, 2, 3], {{'a': 10, 'b': 20}}, (7, 'seven'), 2024 + ) + """ + ) + + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2024' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + wait_for_export_status(node, mt_table, iceberg_table, "2024", "COMPLETED") + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 1, f"Expected 1 row in Iceberg table, got {count}" + + result = node.query( + f""" + SELECT + i16, u16, u32, u64, id, big, + toString(d), toString(d32), toString(dt), + s, toString(uid), + arr, m['a'], m['b'], tup.a, tup.b, year + FROM {iceberg_table} + """ + ).strip() + expected = "\t".join([ + "-100", "200", "50000", "4000000000", + "12345", "1000000000", + "2024-01-15", "2024-01-15", "2024-01-15 12:30:45.000000", + "hello iceberg", "550e8400-e29b-41d4-a716-446655440000", + "[1,2,3]", "10", "20", "7", "seven", "2024", + ]) + assert result == expected, f"Unexpected round-trip data:\n{result!r}\nexpected:\n{expected!r}" + + # Floats compared with a tolerance to avoid formatting flakiness. + floats_ok = node.query( + f"SELECT abs(f32 - 3.14) < 1e-4 AND abs(f64 - 2.718281828459045) < 1e-12 FROM {iceberg_table}" + ).strip() + assert floats_ok == "1", f"Float round-trip outside tolerance: {floats_ok!r}" + + # DateTime64 sub-second component: assert the date part is preserved (exact format varies). + ts_result = node.query(f"SELECT dt64 FROM {iceberg_table}").strip() + assert "2024-01-15" in ts_result, f"DateTime64 date component missing: {ts_result!r}" + + +def test_export_partition_all_iceberg_types_lossy(cluster): + """Lossy narrowing casts across types succeed with the opt-in flag: values that + fit round-trip, Float64 -> Float32 loses precision, and Nullable columns carry + both NULL and non-NULL (the latter via a lossy Nullable(Int64) -> Nullable(Int32)).""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_lossy_types_{uid}" + iceberg_table = f"iceberg_lossy_types_{uid}" + + # Each source column is wider than the destination, so the export inserts a lossy + # narrowing CAST (allowed only because export_merge_tree_part_allow_lossy_cast=1). + # Int8/UInt8 are not Iceberg-representable, so the narrowest integer dest is Int16. + source_columns = ( + "big Int64, ubig UInt64, mid Int32, " + "f Float64, dt DateTime64(6), d Date32, " + "opt_s Nullable(String), opt_i Nullable(Int64), year Int32" + ) + dest_columns = ( + "big Int32, ubig UInt32, mid Int16, " + "f Float32, dt DateTime, d Date, " + "opt_s Nullable(String), opt_i Nullable(Int32), year Int32" + ) + + make_rmt(node, mt_table, source_columns, "year", replica_name="replica1") + make_iceberg_s3(node, iceberg_table, dest_columns, partition_by="year") + + # Values chosen to fit the destination types (the async cast wraps on overflow + # rather than throwing, so out-of-range values would silently corrupt instead). + # opt_s is NULL and opt_i is set, covering both nullable paths in one row. + node.query( + f""" + INSERT INTO {mt_table} (big, ubig, mid, f, dt, d, opt_s, opt_i, year) + VALUES ( + 1000000, 2000000000, 30000, + 2.718281828459045, '2024-01-15 12:30:45.123456', '2024-01-15', + NULL, 100, 2024 + ) + """ + ) + + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2024' TO TABLE {iceberg_table}", + settings={ + "allow_insert_into_iceberg": 1, + "export_merge_tree_part_allow_lossy_cast": 1, + }, + ) + wait_for_export_status(node, mt_table, iceberg_table, "2024", "COMPLETED") + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 1, f"Expected 1 row in Iceberg table, got {count}" + + result = node.query( + f"SELECT big, ubig, mid, toString(d), toString(dt), opt_s, opt_i, year FROM {iceberg_table}" + ).strip() + expected = "\t".join([ + "1000000", "2000000000", "30000", + "2024-01-15", "2024-01-15 12:30:45.000000", "\\N", "100", "2024", + ]) + assert result == expected, f"Unexpected lossy round-trip data:\n{result!r}\nexpected:\n{expected!r}" + + # Float64 -> Float32 stays within Float32 precision but is no longer exact. + f_checks = node.query( + f"SELECT abs(f - 2.718281828459045) < 1e-6, abs(f - 2.718281828459045) > 1e-9 FROM {iceberg_table}" + ).strip() + assert f_checks == "1\t1", f"Expected Float32 precision loss within tolerance, got: {f_checks!r}" From 6f822c8074bcf9aeea7544024a5f226a429a1cfb Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 Jun 2026 17:55:35 -0300 Subject: [PATCH 10/33] add docs --- docs/en/antalya/part_export.md | 6 ++++++ docs/en/antalya/partition_export.md | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md index 03f9f479991b..b73efa277e2a 100644 --- a/docs/en/antalya/part_export.md +++ b/docs/en/antalya/part_export.md @@ -107,6 +107,12 @@ In case a table function is used as the destination, the schema can be omitted a - **Default**: `{part_name}_{checksum}` - **Description**: Pattern for the filename of the exported merge tree part. The `part_name` and `checksum` are calculated and replaced on the fly. Additional macros are supported. +### `export_merge_tree_part_allow_lossy_cast` (Optional) + +- **Type**: `Bool` +- **Default**: `false` +- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. + ## Examples diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md index 975915859482..9bd1dfcd16f1 100644 --- a/docs/en/antalya/partition_export.md +++ b/docs/en/antalya/partition_export.md @@ -111,6 +111,12 @@ Notes: - Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation. - Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires. +### `export_merge_tree_part_allow_lossy_cast` (Optional) + +- **Type**: `Bool` +- **Default**: `false` +- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. + ## Examples ### Basic Export to S3 From f521a95c87278262e89bae8c6ec15218c32ced00 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 16 Jun 2026 09:59:40 -0300 Subject: [PATCH 11/33] improve docs with a warning --- docs/en/antalya/part_export.md | 2 ++ docs/en/antalya/partition_export.md | 2 ++ src/Core/Settings.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md index b73efa277e2a..88e1c831a536 100644 --- a/docs/en/antalya/part_export.md +++ b/docs/en/antalya/part_export.md @@ -113,6 +113,8 @@ In case a table function is used as the destination, the schema can be omitted a - **Default**: `false` - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. + **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value. + ## Examples diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md index 9bd1dfcd16f1..5a5dd2918869 100644 --- a/docs/en/antalya/partition_export.md +++ b/docs/en/antalya/partition_export.md @@ -117,6 +117,8 @@ Notes: - **Default**: `false` - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. + **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value. + ## Examples ### Basic Export to S3 diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 6368ec01c1cb..5fcf971ecc56 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7608,6 +7608,8 @@ Pattern for the filename of the exported merge tree part. The `part_name` and `c )", 0) \ DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"( Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. + +Warning: the partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Lossy casts on a partition column can therefore make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the metadata will still contain the original `Int64` values while the data files contain the truncated `Int32` values. )", 0) \ \ /* ####################################################### */ \ From 41c64eda6f35eeedb14601435f130e0ccad78993 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 16 Jun 2026 21:20:03 -0300 Subject: [PATCH 12/33] hmm... maybe incomplete, gotta continue it tomorrow --- docs/en/antalya/part_export.md | 4 +- docs/en/antalya/partition_export.md | 4 +- src/Core/Settings.cpp | 2 +- ...portReplicatedMergeTreePartitionManifest.h | 7 +++ src/Storages/IStorage.h | 10 ++-- src/Storages/MergeTree/ExportPartTask.cpp | 2 +- .../MergeTree/ExportPartitionUtils.cpp | 18 +++--- src/Storages/MergeTree/ExportPartitionUtils.h | 7 ++- .../DataLakes/IDataLakeMetadata.h | 2 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 58 ++++++++++++++++++- .../DataLakes/Iceberg/IcebergMetadata.h | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 5 +- src/Storages/StorageReplicatedMergeTree.cpp | 2 + 13 files changed, 98 insertions(+), 25 deletions(-) diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md index 88e1c831a536..73d467c5d9b1 100644 --- a/docs/en/antalya/part_export.md +++ b/docs/en/antalya/part_export.md @@ -113,7 +113,9 @@ In case a table function is used as the destination, the schema can be omitted a - **Default**: `false` - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. - **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value. + When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use. This keeps the Iceberg metadata consistent with the data files. + + **Warning:** A lossy cast on a partition column remains semantically truncating. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into a destination `Int32` partition column, both the data files and the Iceberg metadata will contain the truncated `Int32` value (they agree with each other, but the original `Int64` value is lost). Such casts require `export_merge_tree_part_allow_lossy_cast = 1`. ## Examples diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md index 5a5dd2918869..bb17a57f87cd 100644 --- a/docs/en/antalya/partition_export.md +++ b/docs/en/antalya/partition_export.md @@ -117,7 +117,9 @@ Notes: - **Default**: `false` - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. - **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value. + When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use. This keeps the Iceberg metadata consistent with the data files. + + **Warning:** A lossy cast on a partition column remains semantically truncating. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into a destination `Int32` partition column, both the data files and the Iceberg metadata will contain the truncated `Int32` value (they agree with each other, but the original `Int64` value is lost). Such casts require `export_merge_tree_part_allow_lossy_cast = 1`. ## Examples diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 5fcf971ecc56..d905075c5b71 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7609,7 +7609,7 @@ Pattern for the filename of the exported merge tree part. The `part_name` and `c DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"( Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead. -Warning: the partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Lossy casts on a partition column can therefore make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the metadata will still contain the original `Int64` values while the data files contain the truncated `Int32` values. +When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use, so the metadata stays consistent with the data. A lossy cast on a partition column remains semantically truncating: both the data files and the metadata contain the truncated value, and such casts require this setting to be enabled. )", 0) \ \ /* ####################################################### */ \ diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h index acfabc28ca61..db26836bdb2e 100644 --- a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h +++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h @@ -174,6 +174,7 @@ struct ExportReplicatedMergeTreePartitionManifest MergeTreePartExportManifest::FileAlreadyExistsPolicy file_already_exists_policy; String filename_pattern; bool write_full_path_in_iceberg_metadata = false; + bool allow_lossy_cast = false; String iceberg_metadata_json; std::string toJsonString() const @@ -208,6 +209,7 @@ struct ExportReplicatedMergeTreePartitionManifest json.set("ttl_seconds", ttl_seconds); json.set("task_timeout_seconds", task_timeout_seconds); json.set("write_full_path_in_iceberg_metadata", write_full_path_in_iceberg_metadata); + json.set("allow_lossy_cast", allow_lossy_cast); std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM oss.exceptions(std::ios::failbit); Poco::JSON::Stringifier::stringify(json, oss); @@ -262,6 +264,11 @@ struct ExportReplicatedMergeTreePartitionManifest manifest.write_full_path_in_iceberg_metadata = json->getValue("write_full_path_in_iceberg_metadata"); + /// Default to true for tasks created before this field existed, so an in-flight + /// export scheduled with the old permissive worker behavior is not wrongly rejected + /// on upgrade. New tasks always persist the initiator's actual choice. + manifest.allow_lossy_cast = json->has("allow_lossy_cast") ? json->getValue("allow_lossy_cast") : true; + return manifest; } }; diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 0a07b892cef8..93caf3f65ebe 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -504,11 +504,11 @@ It is currently only implemented in StorageObjectStorage. struct IcebergCommitExportPartitionArguments { std::string metadata_json_string; - /// Partition column values (after transforms). Callers are responsible for - /// populating this: the partition-export path parses them from the persisted - /// JSON string, while the direct EXPORT PART path reads them from the part's - /// partition key. - std::vector partition_values; + /// Representative source partition-key columns from one exported part (the part's + /// minmax block). The destination derives the Iceberg partition tuple from a row of + /// this block by casting to the destination column types and applying the partition + /// transform, so the metadata partition value matches the exported data files. + Block partition_source_block; }; virtual void commitExportPartitionTransaction( diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp index f2035577bbf6..e626e7364eaa 100644 --- a/src/Storages/MergeTree/ExportPartTask.cpp +++ b/src/Storages/MergeTree/ExportPartTask.cpp @@ -332,7 +332,7 @@ bool ExportPartTask::executeStep() { IStorage::IcebergCommitExportPartitionArguments iceberg_args; iceberg_args.metadata_json_string = manifest.iceberg_metadata_json; - iceberg_args.partition_values = manifest.data_part->partition.value; + iceberg_args.partition_source_block = block_with_partition_values; destination_storage->commitExportPartitionTransaction( manifest.transaction_id, diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index c902a4042218..679e07d0b132 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -57,14 +57,13 @@ namespace fs = std::filesystem; namespace ExportPartitionUtils { - std::vector getPartitionValuesForIcebergCommit( + Block getPartitionSourceBlockForIcebergCommit( MergeTreeData & storage, const String & partition_id) { auto lock = storage.readLockParts(); const auto parts = storage.getDataPartsVectorInPartitionForInternalUsage( MergeTreeDataPartState::Active, partition_id, lock); - - /// todo arthur: bad arguments for now, pick a better one + if (parts.empty()) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Cannot find active part for partition_id '{}' to derive Iceberg partition " @@ -72,7 +71,7 @@ namespace ExportPartitionUtils "or this replica has not yet received any part for this partition. " "The commit will be retried.", partition_id); - return parts.front()->partition.value; + return parts.front()->minmax_idx->getBlock(storage); } ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest) @@ -102,8 +101,11 @@ namespace ExportPartitionUtils /// stalls when the setting is only set at the query level. context_copy->setSetting("allow_insert_into_iceberg", true); - /// This setting, just like allow_insert_into_iceberg, has been verified at request time. At this point, we allow everything. - context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", true); + /// Reapply the initiator's lossy-cast decision (persisted in the manifest) so the + /// worker's schema revalidation honors the user's choice. Without this, a task + /// scheduled without the opt-in could still apply a lossy cast if the destination + /// schema drifts to a lossy target between scheduling and execution. + context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", manifest.allow_lossy_cast); return context_copy; } @@ -217,8 +219,8 @@ namespace ExportPartitionUtils { iceberg_args.metadata_json_string = manifest.iceberg_metadata_json; if (source_storage.getInMemoryMetadataPtr()->hasPartitionKey()) - iceberg_args.partition_values = - getPartitionValuesForIcebergCommit(source_storage, manifest.partition_id); + iceberg_args.partition_source_block = + getPartitionSourceBlockForIcebergCommit(source_storage, manifest.partition_id); } destination_storage->commitExportPartitionTransaction(manifest.transaction_id, manifest.partition_id, exported_paths, iceberg_args, context); diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h index d97bb538a47f..0434bc59a2cb 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.h +++ b/src/Storages/MergeTree/ExportPartitionUtils.h @@ -27,14 +27,15 @@ namespace ExportPartitionUtils ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest); - /// Returns the partition key values for the given partition_id by reading from - /// the first active local part. Throws LOGICAL_ERROR if no such part is found. + /// Returns the representative source partition-key columns (the first active local part's + /// minmax block) for the given partition_id. The destination recomputes the Iceberg partition + /// tuple from this block by casting to its column types and applying the partition transform. /// /// Edge case: if the partition was dropped after export started, or this replica /// has not yet received any part for this partition (extreme replication lag on a /// recovery path), no active part will be found and the commit will fail. The task /// will be retried on the next poll cycle or picked up by a different replica. - std::vector getPartitionValuesForIcebergCommit( + Block getPartitionSourceBlockForIcebergCommit( MergeTreeData & storage, const String & partition_id); void commit( diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index 85c91ba5de19..a90c49b35210 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -226,7 +226,7 @@ class IDataLakeMetadata : boost::noncopyable const String & /* transaction_id */, Int64 /* original_schema_id */, Int64 /* partition_spec_id */, - const std::vector & /* partition_values */, + const Block & /* partition_source_block */, SharedHeader /* sample_block */, const std::vector & /* data_file_paths */, ContextPtr /* context */) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 6016be1cd341..9819a8783516 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -1524,6 +1525,57 @@ Poco::JSON::Object::Ptr lookupSchema(const Poco::JSON::Object::Ptr & meta, Int64 "Schema with id {} not found in table metadata", schema_id); } +/// Derive the Iceberg partition tuple for an exported part from a representative source row. +/// The MergeTree `partition.value` is the source partition-key expression result; it is neither +/// cast to the destination column types nor expressed through the Iceberg transform, so it must +/// not be written to metadata directly. Within a MergeTree partition the transform result is +/// constant, so a single representative value per partition-source column (taken from the part's +/// minmax block) suffices: cast it to the destination column type and run the same transform the +/// data uses. The result is transform-correct and consistent with the exported data files. +std::vector recomputeExportPartitionValues( + ChunkPartitioner & partitioner, + const SharedHeader & sample_block, + const Block & partition_source_block) +{ + const auto & partition_columns = partitioner.getColumns(); + if (partition_columns.empty()) + return {}; + + for (const auto & column_name : partition_columns) + if (!partition_source_block.has(column_name)) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Partition source column '{}' required by the Iceberg partition transform is missing " + "from the representative source block while committing an export.", column_name); + + Columns columns; + columns.reserve(sample_block->columns()); + for (size_t i = 0; i < sample_block->columns(); ++i) + { + const auto & dest_column = sample_block->getByPosition(i); + if (partition_source_block.has(dest_column.name)) + { + const auto & source = partition_source_block.getByName(dest_column.name); + ColumnWithTypeAndName representative{source.column->cut(0, 1), source.type, source.name}; + columns.push_back(castColumn(representative, dest_column.type)); + } + else + { + auto column = dest_column.type->createColumn(); + column->insertDefault(); + columns.push_back(std::move(column)); + } + } + + auto partitioned = partitioner.partitionChunk(Chunk(std::move(columns), 1)); + if (partitioned.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Recomputing Iceberg partition values produced {} partitions for a single representative row; " + "a MergeTree partition must map to exactly one Iceberg partition.", partitioned.size()); + + const auto & key = partitioned.front().first; + return std::vector(key.begin(), key.end()); +} + } bool IcebergMetadata::commitImportPartitionTransactionImpl( @@ -1826,7 +1878,7 @@ void IcebergMetadata::commitExportPartitionTransaction( const String & transaction_id, Int64 original_schema_id, Int64 partition_spec_id, - const std::vector & partition_values, + const Block & partition_source_block, SharedHeader sample_block, const std::vector & data_file_paths, ContextPtr context) @@ -1895,6 +1947,10 @@ void IcebergMetadata::commitExportPartitionTransaction( const auto partition_columns = partitioner.getColumns(); const auto partition_types = partitioner.getResultTypes(); + /// Recompute the partition tuple via the destination transform so the metadata partition + /// value matches the exported data (rather than the raw source MergeTree partition value). + const auto partition_values = recomputeExportPartitionValues(partitioner, sample_block, partition_source_block); + const auto metadata_compression_method = persistent_components.metadata_compression_method; FileNamesGenerator filename_generator = FileNamesGenerator( diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 2e28df93e0e6..2ec8efe4a9b7 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -141,7 +141,7 @@ class IcebergMetadata : public IDataLakeMetadata const String & transaction_id, Int64 original_schema_id, Int64 partition_spec_id, - const std::vector & partition_values, + const Block & partition_source_block, SharedHeader sample_block, const std::vector & data_file_paths, ContextPtr context) override; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index c08db678c46c..50f8c373767d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -680,7 +680,8 @@ void StorageObjectStorage::commitExportPartitionTransaction( /// Parse the Iceberg metadata snapshot (stored in ZooKeeper at export-start time) only to /// extract the schema-id and partition-spec-id that were current when the export began. /// partition_columns and partition_types are derived inside commitExportPartitionTransaction - /// from the same JSON, so only partition_values need to be carried here. + /// from the same JSON; the representative source partition columns are carried here so the + /// partition tuple can be recomputed through the destination transform. Poco::JSON::Parser iceberg_parser; Poco::JSON::Object::Ptr iceberg_metadata = iceberg_parser.parse(iceberg_commit_export_partition_arguments.metadata_json_string).extract(); @@ -695,7 +696,7 @@ void StorageObjectStorage::commitExportPartitionTransaction( transaction_id, original_schema_id, partition_spec_id, - iceberg_commit_export_partition_arguments.partition_values, + iceberg_commit_export_partition_arguments.partition_source_block, std::make_shared(getInMemoryMetadataPtr()->getSampleBlock()), exported_paths, local_context); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 2301ec1c1a30..10eee4e5db95 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -225,6 +225,7 @@ namespace Setting extern const SettingsUInt64 export_merge_tree_part_max_rows_per_file; extern const SettingsBool export_merge_tree_part_throw_on_pending_mutations; extern const SettingsBool export_merge_tree_part_throw_on_pending_patch_parts; + extern const SettingsBool export_merge_tree_part_allow_lossy_cast; extern const SettingsExportPartitionAllOnError export_merge_tree_partition_all_on_error; extern const SettingsString export_merge_tree_part_filename_pattern; extern const SettingsBool write_full_path_in_iceberg_metadata; @@ -8537,6 +8538,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & manifest.file_already_exists_policy = query_context->getSettingsRef()[Setting::export_merge_tree_part_file_already_exists_policy].value; manifest.filename_pattern = query_context->getSettingsRef()[Setting::export_merge_tree_part_filename_pattern].value; manifest.write_full_path_in_iceberg_metadata = query_context->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata]; + manifest.allow_lossy_cast = query_context->getSettingsRef()[Setting::export_merge_tree_part_allow_lossy_cast]; if (dest_storage->isDataLake()) { From 3b177ed647fe9c74fabe99930d7b8e89c9c34985 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 17 Jun 2026 09:51:12 -0300 Subject: [PATCH 13/33] add tests covering corner tricky case --- .../test.py | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 09cf1c2ff467..00cf5b5ff6f2 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -1396,3 +1396,249 @@ def test_export_partition_all_iceberg_types_lossy(cluster): f"SELECT abs(f - 2.718281828459045) < 1e-6, abs(f - 2.718281828459045) > 1e-9 FROM {iceberg_table}" ).strip() assert f_checks == "1\t1", f"Expected Float32 precision loss within tolerance, got: {f_checks!r}" + + +def _data_file_partition_records(entries): + """Partition dicts of the non-delete data files described by manifest entries.""" + records = [] + for entry in entries: + data_file = entry.get("data_file") or {} + if data_file.get("content", 0) not in (0, None): + continue + partition = data_file.get("partition") + if partition is not None: + records.append(partition) + return records + + +def _partition_scalar(partition, field): + """Read a partition field value, tolerating an Avro-union ``{type: value}`` wrapper.""" + value = partition.get(field) + if isinstance(value, dict): + assert len(value) == 1, f"Unexpected partition union shape for {field!r}: {value!r}" + value = next(iter(value.values())) + return value + + +def test_export_partition_bucket_transform_metadata_matches_data(cluster): + """A bucket[N] partition column whose type changes Int64 -> String records the + destination murmur(String) bucket in the Iceberg metadata, matching the exported + data rather than the source hashLong bucket.""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_bucket_xform_{uid}" + iceberg_table = f"iceberg_bucket_xform_{uid}" + + # N=16, key=42 diverges: icebergBucket(16, 42::Int64)=14 (source/old hashLong) but + # icebergBucket(16, '42')=6 (destination/new murmur over the exported String). + make_rmt(node, mt_table, "id Int64, key Int64", "icebergBucket(16, key)", + replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES (1, 42), (2, 42)") + + make_iceberg_s3(node, iceberg_table, "id Int64, key String", + partition_by="icebergBucket(16, key)") + + pid = first_partition_id(node, mt_table) + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED") + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 2, f"Expected 2 rows after export, got {count}" + + string_bucket = int(node.query( + f"SELECT DISTINCT icebergBucket(16, key) FROM {iceberg_table}" + ).strip()) + long_bucket = int(node.query( + f"SELECT DISTINCT icebergBucket(16, toInt64(key)) FROM {iceberg_table}" + ).strip()) + assert string_bucket != long_bucket, ( + f"Test setup invalid: String and Int64 buckets coincide ({string_bucket}); " + f"pick a different N/key so the transform diverges." + ) + + query_id = f"bucket_xform_{uid}" + node.query( + f"SELECT * FROM {iceberg_table}", + query_id=query_id, + settings={"iceberg_metadata_log_level": "manifest_file_entry"}, + ) + entries = fetch_manifest_entries(node, query_id) + partitions = _data_file_partition_records(entries) + assert partitions, "No data-file partition records found in manifest entries" + meta_values = {int(_partition_scalar(p, "key")) for p in partitions} + assert meta_values == {string_bucket}, ( + f"Metadata bucket {meta_values} must equal the destination String bucket " + f"{string_bucket} (not the source Int64 bucket {long_bucket})." + ) + + +def test_export_partition_month_transform_metadata_matches_data(cluster): + """A month-transform partition records a months-since-epoch value in metadata that + matches the value derived from the exported data, and a transform-filtered read + returns the rows.""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_month_xform_{uid}" + iceberg_table = f"iceberg_month_xform_{uid}" + + make_rmt(node, mt_table, "id Int64, event_date Date", + "toMonthNumSinceEpoch(event_date)", replica_name="replica1") + node.query( + f"INSERT INTO {mt_table} VALUES " + f"(1, '2024-03-05'), (2, '2024-03-20'), (3, '2024-03-31')" + ) + + make_iceberg_s3(node, iceberg_table, "id Int64, event_date Date", + partition_by="toMonthNumSinceEpoch(event_date)") + + pid = first_partition_id(node, mt_table) + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED") + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 3, f"Expected 3 rows after export, got {count}" + + month_num = int(node.query( + f"SELECT DISTINCT toMonthNumSinceEpoch(event_date) FROM {iceberg_table}" + ).strip()) + + query_id = f"month_xform_{uid}" + node.query( + f"SELECT * FROM {iceberg_table}", + query_id=query_id, + settings={"iceberg_metadata_log_level": "manifest_file_entry"}, + ) + entries = fetch_manifest_entries(node, query_id) + partitions = _data_file_partition_records(entries) + assert partitions, "No data-file partition records found in manifest entries" + meta_values = {int(_partition_scalar(p, "event_date")) for p in partitions} + assert meta_values == {month_num}, ( + f"Metadata month {meta_values} must equal toMonthNumSinceEpoch over the data " + f"({month_num})." + ) + + filtered = int(node.query( + f"SELECT count() FROM {iceberg_table} " + f"WHERE toMonthNumSinceEpoch(event_date) = {month_num}" + ).strip()) + assert filtered == 3, f"Transform-filtered read expected 3 rows, got {filtered}" + + +def test_export_partition_identity_type_change_metadata_matches_data(cluster): + """An identity partition column whose type changes UInt16 -> String records the + destination String value in the Iceberg metadata, matching the exported data.""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_identity_xform_{uid}" + iceberg_table = f"iceberg_identity_xform_{uid}" + + make_rmt(node, mt_table, "id Int32, year UInt16", "year", replica_name="replica1") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2024), (2, 2024)") + + make_iceberg_s3(node, iceberg_table, "id Int32, year String", partition_by="year") + + pid = first_partition_id(node, mt_table) + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED") + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 2, f"Expected 2 rows after export, got {count}" + + data_year = node.query(f"SELECT DISTINCT year FROM {iceberg_table}").strip() + assert data_year == "2024", f"Expected exported year '2024' (String), got {data_year!r}" + + query_id = f"identity_xform_{uid}" + node.query( + f"SELECT * FROM {iceberg_table}", + query_id=query_id, + settings={"iceberg_metadata_log_level": "manifest_file_entry"}, + ) + entries = fetch_manifest_entries(node, query_id) + partitions = _data_file_partition_records(entries) + assert partitions, "No data-file partition records found in manifest entries" + meta_values = {str(_partition_scalar(p, "year")) for p in partitions} + assert meta_values == {"2024"}, ( + f"Metadata partition {meta_values} must equal the destination String value " + f"'2024' (not the source integer representation)." + ) + + +def test_export_partition_multicolumn_identity_metadata_matches_data(cluster): + """A multi-column identity partition (event_date Date, retention UInt64 -> Int64) + records per-column values in the Iceberg metadata that match the exported data.""" + node = cluster.instances["replica1"] + + uid = unique_suffix() + mt_table = f"mt_multicol_{uid}" + iceberg_table = f"iceberg_multicol_{uid}" + + # Iceberg has no unsigned types, so retention widens UInt64 -> Int64; the cast is + # not value-preserving per canBeSafelyCast, hence the lossy opt-in below. + make_rmt(node, mt_table, "id Int64, event_date Date, retention UInt64", + "(event_date, retention)", replica_name="replica1") + node.query( + f"INSERT INTO {mt_table} VALUES " + f"(1, '2024-03-05', 30), (2, '2024-03-05', 30), (3, '2024-03-05', 30)" + ) + + make_iceberg_s3(node, iceberg_table, "id Int64, event_date Date, retention Int64", + partition_by="(event_date, retention)") + + pid = first_partition_id(node, mt_table) + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}", + settings={ + "allow_insert_into_iceberg": 1, + "export_merge_tree_part_allow_lossy_cast": 1, + }, + ) + wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED") + + count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip()) + assert count == 3, f"Expected 3 rows after export, got {count}" + + data_retention = int(node.query( + f"SELECT DISTINCT retention FROM {iceberg_table}" + ).strip()) + assert data_retention == 30, f"Expected exported retention 30, got {data_retention}" + + days = int(node.query( + f"SELECT DISTINCT toInt64(event_date) FROM {iceberg_table}" + ).strip()) + + query_id = f"multicol_{uid}" + node.query( + f"SELECT * FROM {iceberg_table}", + query_id=query_id, + settings={"iceberg_metadata_log_level": "manifest_file_entry"}, + ) + entries = fetch_manifest_entries(node, query_id) + partitions = _data_file_partition_records(entries) + assert partitions, "No data-file partition records found in manifest entries" + + meta_dates = {int(_partition_scalar(p, "event_date")) for p in partitions} + assert meta_dates == {days}, ( + f"Metadata event_date {meta_dates} must equal days-since-epoch {days}." + ) + meta_retentions = {int(_partition_scalar(p, "retention")) for p in partitions} + assert meta_retentions == {30}, ( + f"Metadata retention {meta_retentions} must equal the exported value 30." + ) + + filtered = int(node.query( + f"SELECT count() FROM {iceberg_table} " + f"WHERE event_date = '2024-03-05' AND retention = 30" + ).strip()) + assert filtered == 3, f"Partition-filtered read expected 3 rows, got {filtered}" From 4a75d529c95b34897183bd4d110930ee97e73763 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 16 Jun 2026 17:09:41 -0300 Subject: [PATCH 14/33] rmv task ttl --- docs/en/antalya/partition_export.md | 9 +---- src/Core/Settings.cpp | 6 +-- ...portReplicatedMergeTreePartitionManifest.h | 3 -- .../ExportPartitionManifestUpdatingTask.cpp | 37 +++++------------- src/Storages/StorageReplicatedMergeTree.cpp | 33 ++-------------- .../test.py | 35 +++++++++-------- .../test.py | 39 ++++++++++--------- 7 files changed, 53 insertions(+), 109 deletions(-) diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md index bb17a57f87cd..95a2cfe36fb4 100644 --- a/docs/en/antalya/partition_export.md +++ b/docs/en/antalya/partition_export.md @@ -59,7 +59,7 @@ TO TABLE [destination_database.]destination_table - **Type**: `Bool` - **Default**: `false` -- **Description**: Ignore existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition to the same destination before the manifest expires. **IMPORTANT:** this is dangerous because it can lead to duplicated data, use it with caution. +- **Description**: Ignore existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition that was already exported to the same destination. **IMPORTANT:** this is dangerous because it can lead to duplicated data, use it with caution. #### `export_merge_tree_partition_max_retries` (Optional) @@ -67,12 +67,6 @@ TO TABLE [destination_database.]destination_table - **Default**: `3` - **Description**: Maximum number of retries for exporting a merge tree part in an export partition task. If it exceeds, the entire task fails. -#### `export_merge_tree_partition_manifest_ttl` (Optional) - -- **Type**: `UInt64` -- **Default**: `180` (seconds) -- **Description**: Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination. This setting does not affect or delete in-progress tasks; it only cleans up completed ones. - #### `export_merge_tree_part_file_already_exists_policy` (Optional) - **Type**: `MergeTreePartExportFileAlreadyExistsPolicy` @@ -109,7 +103,6 @@ When the timeout is exceeded the task transitions to KILLED (same terminal state Notes: - Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation. -- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires. ### `export_merge_tree_part_allow_lossy_cast` (Optional) diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index d905075c5b71..7a9089880d96 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7557,10 +7557,6 @@ Ignore existing partition export and overwrite the zookeeper entry )", 0) \ DECLARE(UInt64, export_merge_tree_partition_max_retries, 3, R"( Maximum number of retries for exporting a merge tree part in an export partition task -)", 0) \ - DECLARE(UInt64, export_merge_tree_partition_manifest_ttl, 86400, R"( -Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination. -This setting does not affect / delete in progress tasks. It'll only cleanup the completed ones. )", 0) \ DECLARE(UInt64, export_merge_tree_partition_task_timeout_seconds, 86400, R"( Maximum wall-clock duration (in seconds) an export partition task is allowed to remain in the PENDING state before it is auto-killed by the background cleanup loop. @@ -7573,7 +7569,6 @@ In such scenario, ClickHouse would attempt to commit those files again producing Notes: - Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation. -- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires. )", 0) \ DECLARE(MergeTreePartExportFileAlreadyExistsPolicy, export_merge_tree_part_file_already_exists_policy, MergeTreePartExportFileAlreadyExistsPolicy::skip, R"( Possible values: @@ -7948,6 +7943,7 @@ Maximum number of WebAssembly UDF instances that can run in parallel per functio #define OBSOLETE_SETTINGS(M, ALIAS) \ /** Obsolete settings which are kept around for compatibility reasons. They have no effect anymore. */ \ + MAKE_OBSOLETE(M, UInt64, export_merge_tree_partition_manifest_ttl, 86400) \ MAKE_OBSOLETE(M, Bool, query_condition_cache_store_conditions_as_plaintext, false) \ MAKE_OBSOLETE(M, Bool, update_insert_deduplication_token_in_dependent_materialized_views, 0) \ MAKE_OBSOLETE(M, UInt64, max_memory_usage_for_all_queries, 0) \ diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h index db26836bdb2e..4e28e5e4f505 100644 --- a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h +++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h @@ -164,7 +164,6 @@ struct ExportReplicatedMergeTreePartitionManifest std::vector parts; time_t create_time; size_t max_retries; - size_t ttl_seconds; size_t task_timeout_seconds; size_t max_threads; bool parallel_formatting; @@ -206,7 +205,6 @@ struct ExportReplicatedMergeTreePartitionManifest json.set("filename_pattern", filename_pattern); json.set("create_time", create_time); json.set("max_retries", max_retries); - json.set("ttl_seconds", ttl_seconds); json.set("task_timeout_seconds", task_timeout_seconds); json.set("write_full_path_in_iceberg_metadata", write_full_path_in_iceberg_metadata); json.set("allow_lossy_cast", allow_lossy_cast); @@ -242,7 +240,6 @@ struct ExportReplicatedMergeTreePartitionManifest manifest.parts.push_back(parts_array->getElement(static_cast(i))); manifest.create_time = json->getValue("create_time"); - manifest.ttl_seconds = json->getValue("ttl_seconds"); manifest.task_timeout_seconds = json->getValue("task_timeout_seconds"); manifest.max_threads = json->getValue("max_threads"); manifest.parallel_formatting = json->getValue("parallel_formatting"); diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index b45f3667be87..6926ee77953a 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -128,20 +128,19 @@ namespace } /* - Remove expired entries and fix non-committed exports that have already exported all parts. + Enforce the PENDING task timeout and recover non-committed exports that have already + exported all parts. Entries are never removed for age — `system.replicated_partition_exports` + is append-only history. - Return values: - - true: the cleanup was successful, the entry is removed from the entries_by_key container and the function returns true. Proceed to the next entry. - - false: the cleanup was not successful, the entry is not removed from the entries_by_key container and the function returns false. + Always returns false: the entry stays in the in-memory container. A KILLED transition is + driven by the status watch, and a deferred commit is handled by the caller after the lock + is released. Side outputs: - `deferred_commits`: when a PENDING entry has all parts processed but the export was never committed, this function appends a CommitRecoveryWork item to be executed by the caller after releasing the storage-wide mutex. The actual commit() call (which performs network I/O to the destination catalog and S3) MUST NOT run under the lock. - The function still returns `false` in that case so the outer poll() loop falls through - to `addTask`, keeping the in-memory entry consistent regardless of whether the - deferred commit ultimately succeeds. */ bool tryCleanup( const zkutil::ZooKeeperPtr & zk, @@ -149,33 +148,17 @@ namespace const LoggerPtr & log, const ContextPtr & storage_context, StorageReplicatedMergeTree & storage, - const std::string & key, const ExportReplicatedMergeTreePartitionManifest & metadata, const time_t now, const bool is_pending, - auto & entries_by_key, std::vector & deferred_commits ) { - bool has_expired = metadata.create_time < now - static_cast(metadata.ttl_seconds); - bool task_timed_out = is_pending && metadata.task_timeout_seconds > 0 && metadata.create_time + static_cast(metadata.task_timeout_seconds) < now; - if (has_expired && !is_pending) - { - zk->tryRemoveRecursive(fs::path(entry_path)); - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRemoveRecursive); - auto it = entries_by_key.find(key); - if (it != entries_by_key.end()) - entries_by_key.erase(it); - LOG_INFO(log, "ExportPartition Manifest Updating Task: Removed {}: expired", key); - - return true; - } - else if (task_timed_out) + if (task_timed_out) { const std::string status_path = fs::path(entry_path) / "status"; @@ -365,8 +348,8 @@ void ExportPartitionManifestUpdatingTask::poll() const std::string cleanup_lock_path = fs::path(storage.zookeeper_path) / "exports_cleanup_lock"; /// The `exports_cleanup_lock` is an ephemeral ZK node that serializes cleanup work - /// across replicas: only the replica holding it walks `tryCleanup` (entry expiry + - /// commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer + /// across replicas: only the replica holding it walks `tryCleanup` (task-timeout + /// enforcement + commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer /// replica's next poll() could acquire it and race us on the same commit-recovery work, /// duplicating REST-catalog round-trips and snapshot writes. The EphemeralNodeHolder /// destructor removes the node, so we declare it at function scope and let it die @@ -478,11 +461,9 @@ void ExportPartitionManifestUpdatingTask::poll() storage.log.load(), storage.getContext(), storage, - key, metadata, now, *status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, - entries_by_key, deferred_commits); if (cleanup_successful) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 10eee4e5db95..046399f0bdfe 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -215,7 +215,6 @@ namespace Setting extern const SettingsBool allow_experimental_export_merge_tree_part; extern const SettingsBool export_merge_tree_partition_force_export; extern const SettingsUInt64 export_merge_tree_partition_max_retries; - extern const SettingsUInt64 export_merge_tree_partition_manifest_ttl; extern const SettingsUInt64 export_merge_tree_partition_task_timeout_seconds; extern const SettingsBool output_format_parallel_formatting; extern const SettingsBool output_format_parquet_parallel_encoding; @@ -8416,36 +8415,11 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperExists); if (zookeeper->exists(partition_exports_path)) { - LOG_INFO(log, "Export with key {} is already exported or it is being exported. Checking if it has expired so that we can overwrite it", export_key); + LOG_INFO(log, "Export with key {} is already exported or it is being exported", export_key); - bool has_expired = false; - - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperExists); - if (zookeeper->exists(fs::path(partition_exports_path) / "metadata.json")) - { - std::string metadata_json; - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet); - if (zookeeper->tryGet(fs::path(partition_exports_path) / "metadata.json", metadata_json)) - { - const auto manifest = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); - - const auto now = time(nullptr); - const auto expiration_time = manifest.create_time + manifest.ttl_seconds; - - LOG_INFO(log, "Export with key {} has expiration time {}, now is {}", export_key, expiration_time, now); - - if (static_cast(expiration_time) < now) - { - has_expired = true; - } - } - } - - if (!has_expired && !query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export]) + if (!query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export]) { - throw Exception(ErrorCodes::EXPORT_PARTITION_ALREADY_EXPORTED, "Export with key {} already exported or it is being exported, and it has not expired. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key); + throw Exception(ErrorCodes::EXPORT_PARTITION_ALREADY_EXPORTED, "Export with key {} already exported or it is being exported. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key); } LOG_INFO(log, "Overwriting export with key {}", export_key); @@ -8527,7 +8501,6 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & manifest.parts = part_names; manifest.create_time = time(nullptr); manifest.max_retries = query_context->getSettingsRef()[Setting::export_merge_tree_partition_max_retries]; - manifest.ttl_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_manifest_ttl]; manifest.task_timeout_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_task_timeout_seconds]; manifest.max_threads = query_context->getSettingsRef()[Setting::max_threads]; manifest.parallel_formatting = query_context->getSettingsRef()[Setting::output_format_parallel_formatting]; diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 00cf5b5ff6f2..89ceeca47003 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -746,13 +746,12 @@ def test_partition_key_compatibility_check(cluster): ) -def test_export_ttl(cluster): +def test_export_entries_never_expire(cluster): """ - After a manifest TTL expires the same partition can be re-exported, and the - new data is appended to (or replaces) what is in the Iceberg table. + Export entries never expire: a second export of the same partition is rejected + indefinitely unless export_merge_tree_partition_force_export is set. """ node = cluster.instances["replica1"] - ttl_seconds = 3 uid = unique_suffix() mt_table = f"mt_{uid}" @@ -763,28 +762,31 @@ def test_export_ttl(cluster): # First export. node.query( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} " - f"SETTINGS export_merge_tree_partition_manifest_ttl = {ttl_seconds}, allow_insert_into_iceberg = 1" + f"SETTINGS allow_insert_into_iceberg = 1" ) + wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") + + count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip()) + assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}" - # A second export before the TTL expires must be rejected. + # A second export is rejected, and stays rejected (the entry never expires). error = node.query_and_get_error( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", settings={"allow_insert_into_iceberg": 1}, ) - assert "Export with key" in error, f"Expected duplicate-export error before TTL, got: {error}" - - wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") - - count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip()) - assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}" + assert "Export with key" in error, f"Expected duplicate-export error, got: {error}" - # Wait for the manifest TTL to expire. - time.sleep(ttl_seconds * 2) + time.sleep(5) + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", + settings={"allow_insert_into_iceberg": 1}, + ) + assert "Export with key" in error, f"Entry must not expire, got: {error}" - # Second export must be accepted now. + # Only force_export can overwrite the existing entry. node.query( f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", - settings={"allow_insert_into_iceberg": 1}, + settings={"allow_insert_into_iceberg": 1, "export_merge_tree_partition_force_export": 1}, ) wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") @@ -888,7 +890,6 @@ def test_export_task_timeout_kills_stuck_pending_task(cluster): f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}" f" SETTINGS export_merge_tree_partition_task_timeout_seconds = 5," f" export_merge_tree_partition_max_retries = 1000000," - f" export_merge_tree_partition_manifest_ttl = 3600," f" allow_insert_into_iceberg = 1" ) diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py index d403538999df..c4e38cbff276 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py @@ -627,40 +627,43 @@ def test_inject_short_living_failures(cluster): assert int(exception_count.strip()) >= 1, "Expected at least one exception" -def test_export_ttl(cluster): +def test_export_entries_never_expire(cluster): + """Export entries are kept as history; a duplicate export is rejected unless forced.""" node = cluster.instances["replica1"] postfix = str(uuid.uuid4()).replace("-", "_") mt_table = f"export_ttl_mt_table_{postfix}" s3_table = f"export_ttl_s3_table_{postfix}" - expiration_time = 3 - create_tables_and_insert_data(node, mt_table, s3_table, "replica1") # start export - node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_manifest_ttl={expiration_time};") - - # assert that I get an error when trying to export the same partition again, query_and_get_error - error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};") - assert "Export with key" in error, "Expected error about expired export" - - # wait for the export to finish and for the manifest to expire + node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};") wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") - time.sleep(expiration_time * 2) - # assert that the export succeeded, check the commit file assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed" - # start export again - node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}") + # the entry never expires, so re-exporting the same partition is always rejected + error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};") + assert "Export with key" in error, "Expected duplicate-export error" - # wait for the export to finish + # the completed entry is still present in the history table after a while + time.sleep(5) + assert node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) == "COMPLETED\n", "Export entry should remain in history and not expire" + + # only force_export can overwrite the existing entry + node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export = 1") wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") - # assert that the export succeeded, check the commit file - # there should be two commit files now, one for the first export and one for the second export - assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Export did not succeed" + # there should be two commit files now, one for each export + assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Forced re-export did not succeed" def test_export_partition_file_already_exists_policy(cluster): From a676814106777b62d94c002e8754582da3ad4e71 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 16 Jun 2026 17:18:32 -0300 Subject: [PATCH 15/33] cleanup --- .../ExportPartitionManifestUpdatingTask.cpp | 43 +++++++----------- .../test.py | 45 ------------------- .../test.py | 39 ---------------- 3 files changed, 17 insertions(+), 110 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 6926ee77953a..8ba8d8fc64a0 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -130,11 +130,9 @@ namespace /* Enforce the PENDING task timeout and recover non-committed exports that have already exported all parts. Entries are never removed for age — `system.replicated_partition_exports` - is append-only history. - - Always returns false: the entry stays in the in-memory container. A KILLED transition is - driven by the status watch, and a deferred commit is handled by the caller after the lock - is released. + is append-only history, so the entry always stays in the in-memory container: a KILLED + transition is driven by the status watch, and a deferred commit is handled by the caller + after the lock is released. Side outputs: - `deferred_commits`: when a PENDING entry has all parts processed but the export was @@ -142,7 +140,7 @@ namespace the caller after releasing the storage-wide mutex. The actual commit() call (which performs network I/O to the destination catalog and S3) MUST NOT run under the lock. */ - bool tryCleanup( + void tryCleanup( const zkutil::ZooKeeperPtr & zk, const std::string & entry_path, const LoggerPtr & log, @@ -170,14 +168,14 @@ namespace if (!zk->tryGet(status_path, status_string, &status_stat)) { LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to read status for {} while enforcing task timeout, skipping", entry_path); - return false; + return; } const auto current_status = magic_enum::enum_cast(status_string); if (!current_status || *current_status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) { LOG_INFO(log, "ExportPartition Manifest Updating Task: Task {} is not PENDING, can't set to KILLED, skipping", entry_path); - return false; + return; } const auto timeout_message = fmt::format( @@ -217,9 +215,9 @@ namespace entry_path, rc); } - /// Return false so the entry remains in entries_by_key; the status watch will drive + /// The entry remains in entries_by_key; the status watch will drive /// handleStatusChanges -> killExportPart on every replica, mirroring user-initiated KILL. - return false; + return; } else if (is_pending) { @@ -232,7 +230,7 @@ namespace { LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to get parts in processing or pending, skipping"); - return false; + return; } if (parts_in_processing_or_pending.empty()) @@ -244,7 +242,7 @@ namespace if (!destination_storage) { LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs()); - return false; + return; } /// A replica exported the last part but the commit never landed. Capture everything @@ -253,22 +251,18 @@ namespace /// MAX_TRANSACTION_RETRIES = 100 retries; holding the storage-wide mutex across /// that work is what caused `system.replicated_partition_exports` to hang. /// - /// Returning false here keeps the outer poll() loop on the normal path: it will - /// call addTask() so the in-memory container reflects the PENDING entry. The - /// status watch registered by poll() will transition the local entry to - /// COMPLETED/FAILED once the deferred commit (or a peer's commit) updates - /// /status in ZooKeeper. + /// The outer poll() loop stays on the normal path: it will call addTask() so the + /// in-memory container reflects the PENDING entry. The status watch registered by + /// poll() will transition the local entry to COMPLETED/FAILED once the deferred + /// commit (or a peer's commit) updates /status in ZooKeeper. deferred_commits.push_back(CommitRecoveryWork{ .metadata = metadata, .entry_path = entry_path, .destination_storage = destination_storage, .context = context, }); - return false; } } - - return false; } } @@ -451,11 +445,11 @@ void ExportPartitionManifestUpdatingTask::poll() continue; } - /// if we have the cleanup lock, try to cleanup - /// if we successfully cleaned it up, early exit + /// If we hold the cleanup lock, enforce the task timeout and recover uncommitted exports. + /// Entries are never removed here, so we always fall through to refresh / addTask below. if (cleanup_lock) { - bool cleanup_successful = tryCleanup( + tryCleanup( zk, entry_path, storage.log.load(), @@ -465,9 +459,6 @@ void ExportPartitionManifestUpdatingTask::poll() now, *status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, deferred_commits); - - if (cleanup_successful) - continue; } if (has_local_entry_and_is_up_to_date) diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py index 89ceeca47003..62a8a2196313 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py @@ -746,51 +746,6 @@ def test_partition_key_compatibility_check(cluster): ) -def test_export_entries_never_expire(cluster): - """ - Export entries never expire: a second export of the same partition is rejected - indefinitely unless export_merge_tree_partition_force_export is set. - """ - node = cluster.instances["replica1"] - - uid = unique_suffix() - mt_table = f"mt_{uid}" - iceberg_table = f"iceberg_{uid}" - - setup_tables(cluster, mt_table, iceberg_table, nodes=["replica1"]) - - # First export. - node.query( - f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} " - f"SETTINGS allow_insert_into_iceberg = 1" - ) - wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") - - count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip()) - assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}" - - # A second export is rejected, and stays rejected (the entry never expires). - error = node.query_and_get_error( - f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", - settings={"allow_insert_into_iceberg": 1}, - ) - assert "Export with key" in error, f"Expected duplicate-export error, got: {error}" - - time.sleep(5) - error = node.query_and_get_error( - f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", - settings={"allow_insert_into_iceberg": 1}, - ) - assert "Export with key" in error, f"Entry must not expire, got: {error}" - - # Only force_export can overwrite the existing entry. - node.query( - f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}", - settings={"allow_insert_into_iceberg": 1, "export_merge_tree_partition_force_export": 1}, - ) - wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED") - - def test_export_data_files_are_not_cleaned_up_on_commit_failure(cluster): """ Verify that the data files are not cleaned up on commit failure and the export is retried. diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py index c4e38cbff276..8ad265a375ad 100644 --- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py @@ -627,45 +627,6 @@ def test_inject_short_living_failures(cluster): assert int(exception_count.strip()) >= 1, "Expected at least one exception" -def test_export_entries_never_expire(cluster): - """Export entries are kept as history; a duplicate export is rejected unless forced.""" - node = cluster.instances["replica1"] - - postfix = str(uuid.uuid4()).replace("-", "_") - mt_table = f"export_ttl_mt_table_{postfix}" - s3_table = f"export_ttl_s3_table_{postfix}" - - create_tables_and_insert_data(node, mt_table, s3_table, "replica1") - - # start export - node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};") - wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") - - assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed" - - # the entry never expires, so re-exporting the same partition is always rejected - error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};") - assert "Export with key" in error, "Expected duplicate-export error" - - # the completed entry is still present in the history table after a while - time.sleep(5) - assert node.query( - f""" - SELECT status FROM system.replicated_partition_exports - WHERE source_table = '{mt_table}' - AND destination_table = '{s3_table}' - AND partition_id = '2020' - """ - ) == "COMPLETED\n", "Export entry should remain in history and not expire" - - # only force_export can overwrite the existing entry - node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export = 1") - wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") - - # there should be two commit files now, one for each export - assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Forced re-export did not succeed" - - def test_export_partition_file_already_exists_policy(cluster): node = cluster.instances["replica1"] From 16dac3cf25c8c6616945cacb215da10dc1b03111 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 16 Jun 2026 20:57:30 -0300 Subject: [PATCH 16/33] missing piece --- .../design/alter-table-export-part-partition.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/antalya/docs/design/alter-table-export-part-partition.md b/antalya/docs/design/alter-table-export-part-partition.md index d513e4b80e41..8ebe237c634a 100644 --- a/antalya/docs/design/alter-table-export-part-partition.md +++ b/antalya/docs/design/alter-table-export-part-partition.md @@ -422,12 +422,13 @@ The following notes expand on expected behavior of commands. every active part of partition `p` across all replicas that host it; `system.replicated_partition_exports` converges to `COMPLETED`. -4. Re-issuing the same `EXPORT PARTITION` within - `export_merge_tree_partition_manifest_ttl` is a no-op (no - duplicate files) unless `export_merge_tree_partition_force_export = 1`. This - behavior avoids accidentally exporting the same data twice. Note, however - that forcing the operation is dangerous if ClickHouse can't clean up the - previous operation. In this case you'll potentially commit files twice. +4. Re-issuing the same `EXPORT PARTITION` is rejected (no duplicate + files) unless `export_merge_tree_partition_force_export = 1`. Export + entries are kept indefinitely in `system.replicated_partition_exports` + as history; they never expire. This behavior avoids accidentally + exporting the same data twice. Note, however that forcing the + operation is dangerous if ClickHouse can't clean up the previous + operation. In this case you'll potentially commit files twice. 5. Killing an in-flight partition export via `KILL EXPORT PARTITION` transitions status to `KILLED` and stops all replicas' contributions. @@ -481,7 +482,6 @@ The following notes expand on expected behavior of commands. | `export_merge_tree_part_filename_pattern` | query | `{part_name}_{checksum}` | `String` | both | Filename template; supports `{part_name}`, `{checksum}`, `{database}`, `{table}`, server macros. | | `export_merge_tree_partition_force_export` | query | `false` | `Bool` | `EXPORT PARTITION` | Overwrite a live Keeper manifest for the same `(source, destination, partition_id)`. Dangerous — can produce duplicate data on the destination; use with caution. | | `export_merge_tree_partition_max_retries` | query | `3` | `UInt64` | `EXPORT PARTITION` | Retry budget applied to both per-part export attempts and per-task commit attempts (Iceberg). The task fails terminally if commit retries alone exceed the budget. | -| `export_merge_tree_partition_manifest_ttl` | query | `180` (seconds) | `UInt64` | `EXPORT PARTITION` | Live-manifest TTL; acts as the idempotency window. Does not interrupt in-flight tasks. Keep this greater than `export_merge_tree_partition_task_timeout_seconds` if you want the `KILLED` entry to remain visible in `system.replicated_partition_exports` after the timeout fires. | | `export_merge_tree_partition_task_timeout_seconds` | query | `3600` (seconds) | `UInt64` (`0`=disable) | `EXPORT PARTITION` | Wall-clock cap for `PENDING` tasks; on expiry transitions to `KILLED` with a timeout reason. Measured from manifest `create_time`. Enforcement latency ≈ one manifest-updater poll cycle (~30s) plus Keeper watch propagation. | | `export_merge_tree_partition_system_table_prefer_remote_information` | query | `false` | `Bool` | `EXPORT PARTITION` | When `true`, `system.replicated_partition_exports` fetches fresh state from Keeper (requires the `MULTI_READ` feature flag); when `false`, uses local cached state. **Default flipped from `true` to `false` in this release** — Keeper round-trips were more expensive than warranted for the typical observability workload. (See NOTE 2.)| | `export_merge_tree_part_file_already_exists_policy` | query | `skip` | `skip` / `error` / `overwrite` | `EXPORT PARTITION` | Per-file policy during partition export. | @@ -722,7 +722,8 @@ peak memory. Hot path is the Parquet encoder, which warrants a guard against reg (`parts_to_do > 0`) rather than corrupt data. Acceptable but must be documented in the upgrade notes. - **Risk (object-storage cost / accidental large exports):** mitigated by the experimental - gates (default off) and the manifest idempotency window. + gates (default off) and the duplicate-export rejection (an existing export key is refused + unless `export_merge_tree_partition_force_export` is set). - **Risk (Iceberg catalog manifest retention):** if the catalog reaps old manifest files with a retention window shorter than `export_merge_tree_partition_task_timeout_seconds`, the rare "sole-node commits, crashes, recovers after reaper deleted the commit manifest" From 978f9e172f5b9df86e529b017f6c5bff6802087d Mon Sep 17 00:00:00 2001 From: CarlosFelipeOR Date: Thu, 18 Jun 2026 17:39:19 -0300 Subject: [PATCH 17/33] Skip store_data.py pre-hook in MasterCI to avoid Finish Workflow template-size overflow Signed-off-by: CarlosFelipeOR --- ci/workflows/master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/workflows/master.py b/ci/workflows/master.py index cb67d223d13c..e6e9bbdeac88 100644 --- a/ci/workflows/master.py +++ b/ci/workflows/master.py @@ -101,7 +101,7 @@ enable_commit_status_on_failure=True, enable_slack_feed=False, pre_hooks=[ - "python3 ./ci/jobs/scripts/workflow_hooks/store_data.py", + # "python3 ./ci/jobs/scripts/workflow_hooks/store_data.py", # NOTE (carlosfelipeor): we don't use this in master CI "python3 ./ci/jobs/scripts/workflow_hooks/version_log.py", "python3 ./ci/jobs/scripts/workflow_hooks/parse_ci_tags.py", # "python3 ./ci/jobs/scripts/workflow_hooks/merge_sync_pr.py", # NOTE (strtgbb): we don't do this From 03227749150845333ad362aca0961a5bf08c4b5b Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Fri, 27 Mar 2026 13:41:38 -0400 Subject: [PATCH 18/33] Add 'PRs in Release' table to report --- .../ci_run_report.html.jinja | 9 + .../create_workflow_report.py | 311 +++++++++++++++--- 2 files changed, 279 insertions(+), 41 deletions(-) diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja index 4c94465a16c6..a6e0df17c49f 100644 --- a/.github/actions/create_workflow_report/ci_run_report.html.jinja +++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja @@ -168,6 +168,7 @@ {% endif %}

Table of Contents

+ {%- if pr_number == 0 -%} +

PRs in Release

+ {% if prs_in_release_missing_verification %} +

Some PRs are missing verification.

+ {% endif %} + {{ prs_in_release_html }} + {%- endif %} + {%- if pr_number != 0 -%}

New Fails in PR

Compared with base sha {{ base_sha }}

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index 8daf0fc24f31..a4f43e172459 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -6,10 +6,12 @@ from itertools import combinations import json from datetime import datetime +from datetime import timezone from functools import lru_cache from glob import glob import urllib.parse import re +import subprocess import pandas as pd from jinja2 import Environment, FileSystemLoader @@ -169,6 +171,196 @@ def get_run_details(run_id: str) -> dict: return response.json() +def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.DataFrame, bool]: + if len(df) == 0: + return pd.DataFrame(columns=["pr_number", "pr_name", "labels"]), False + if not GITHUB_TOKEN: + raise Exception("GITHUB_TOKEN is required to fetch PR titles and labels") + headers = { + "Authorization": f"token {GITHUB_TOKEN}", + "Accept": "application/vnd.github.v3+json", + } + rows = [] + missing_verification = False + for pr_number in df["pr_number"].tolist(): + response = requests.get( + f"https://api.github.com/repos/{repo}/pulls/{pr_number}", + headers=headers, + ) + if response.status_code != 200: + raise Exception( + f"Failed to fetch pull request info: {response.status_code} {response.text}" + ) + pr = response.json() + label_names = [l["name"] for l in pr.get("labels", [])] + if any(ln.lower() == "cicd" for ln in label_names): + continue + lowered = {ln.lower() for ln in label_names} + if "verified" not in lowered and "verified-with-issue" not in lowered: + missing_verification = True + rows.append( + { + "pr_number": pr_number, + "pr_name": pr.get("title", ""), + "labels": ", ".join(sorted(label_names)), + } + ) + return pd.DataFrame(rows), missing_verification + + +def _git_rev_parse(ref: str, cwd: str | None) -> str | None: + p = subprocess.run( + ["git", "rev-parse", "--verify", ref], + cwd=cwd, + capture_output=True, + text=True, + ) + if p.returncode != 0: + return None + return p.stdout.strip() + + +def _git_is_ancestor(ancestor: str, descendant: str, cwd: str | None) -> bool: + p = subprocess.run( + ["git", "merge-base", "--is-ancestor", ancestor, descendant], + cwd=cwd, + capture_output=True, + ) + return p.returncode == 0 + + +def _git_log_merge_prs( + baseline: str, branch_ref: str, cwd: str | None, repo: str +) -> pd.DataFrame: + p = subprocess.run( + [ + "git", + "-c", + "core.quotepath=false", + "log", + f"{baseline}..{branch_ref}", + "--merges", + "--format=%H%x09%s", + ], + cwd=cwd, + capture_output=True, + text=True, + check=True, + ) + rows = [] + for line in p.stdout.splitlines(): + if not line.strip(): + continue + sha, subject = line.split("\t", 1) + m = re.match( + r"Merge pull request #(\d+) from ([^/\s]+)/", subject, re.IGNORECASE + ) + if not m: + continue + pr_number, head_owner = int(m.group(1)), m.group(2) + if head_owner.lower() != repo.split("/")[0].lower(): + continue + rows.append( + { + "pr_number": pr_number, + "merge_commit_sha": sha, + "merge_subject": subject, + } + ) + if not rows: + return pd.DataFrame(columns=["pr_number", "merge_commit_sha", "merge_subject"]) + df = pd.DataFrame(rows) + df = df.drop_duplicates(subset=["pr_number"], keep="first") + return df + + +def _find_release_baseline( + branch_ref: str, repo: str, cwd: str | None +) -> tuple[str | None, str | None]: + if not GITHUB_TOKEN: + return None, None + headers = { + "Authorization": f"token {GITHUB_TOKEN}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.get( + f"https://api.github.com/repos/{repo}/releases?per_page=100", + headers=headers, + ) + if response.status_code != 200: + raise Exception( + f"GitHub API request failed: {response.status_code} {response.text}" + ) + for rel in response.json(): + if rel.get("draft"): + continue + tag_name = rel.get("tag_name") + if not tag_name: + continue + tag_sha = _git_rev_parse(tag_name, cwd) + if not tag_sha: + continue + if not _git_is_ancestor(tag_sha, branch_ref, cwd): + continue + return tag_name, tag_sha + return None, None + + +def _find_rebase_baseline(branch_ref: str, cwd: str | None) -> str | None: + p = subprocess.run( + [ + "git", + "log", + branch_ref, + "--reverse", + "--merges", + "-i", + "--grep=rebase-cicd", + "--grep=rebase/", + "--format=%H", + ], + cwd=cwd, + capture_output=True, + text=True, + ) + if p.returncode != 0: + return None + lines = [ln for ln in p.stdout.splitlines() if ln.strip()] + if not lines: + return None + return lines[0] + + +def get_prs_in_release_dataframe( + branch_ref: str = "HEAD", + *, + repo: str = GITHUB_REPO, + cwd: str | None = None, +) -> tuple[pd.DataFrame, bool]: + """ + PRs merged into branch_ref that belong in the next release notes: after the latest GitHub + Release tag on this history, or after the oldest rebase bootstrap if no such tag exists. + Only merge commits whose subject has from / (e.g. from Altinity/) are included. + Columns: pr_number, pr_name, labels. Omits PRs labeled cicd. + Second value is True if any listed PR lacks verified or verified-with-issue. + """ + branch_sha = _git_rev_parse(branch_ref, cwd) + if not branch_sha: + raise Exception(f"Cannot resolve branch ref: {branch_ref!r}") + + baseline_ref, baseline_sha = _find_release_baseline(branch_ref, repo, cwd) + if not baseline_sha: + rebase_sha = _find_rebase_baseline(branch_ref, cwd) + if not rebase_sha: + raise Exception( + "No release tag on this branch and no rebase bootstrap merge found" + ) + baseline_sha = rebase_sha + + df = _git_log_merge_prs(baseline_sha, branch_ref, cwd, repo) + return _enrich_prs_in_release_merge_prs(df, repo) + + def _checks_latest_test_status_cte(commit_sha: str, branch_name: str) -> str: """ Shared filtering for gh-data.checks: anchor time excludes stateless teardown checks @@ -622,25 +814,40 @@ def format_test_status(text: str) -> str: def format_results_as_html_table(results) -> str: + if not isinstance(results, pd.DataFrame): + return results + if len(results) == 0: return "

Nothing to report

" - results.columns = [col.replace("_", " ").title() for col in results.columns] + + results = results.copy() + + def format_col_name(col_name: str) -> str: + return col_name.replace("_", " ").title().replace("Pr ", "PR ") + + results.columns = [format_col_name(col) for col in results.columns] + + formatters = { + "Results Link": url_to_html_link, + "Test Name": format_test_name_for_linewrap, + "Test Status": format_test_status, + "Job Status": format_test_status, + "Status": format_test_status, + "Message": lambda m: m.replace("\n", " "), + "Identifier": lambda i: url_to_html_link( + "https://nvd.nist.gov/vuln/detail/" + i + ), + "Severity": lambda s: ( + f'{s}' + ), + "PR Number": lambda n: url_to_html_link( + f"https://github.com/{GITHUB_REPO}/pull/{n}" + ), + } + html = results.to_html( index=False, - formatters={ - "Results Link": url_to_html_link, - "Test Name": format_test_name_for_linewrap, - "Test Status": format_test_status, - "Job Status": format_test_status, - "Status": format_test_status, - "Message": lambda m: m.replace("\n", " "), - "Identifier": lambda i: url_to_html_link( - "https://nvd.nist.gov/vuln/detail/" + i - ), - "Severity": lambda s: ( - f'{s}' - ), - }, + formatters=formatters, escape=False, border=0, classes=["test-results-table"], @@ -816,7 +1023,9 @@ def create_workflow_report( settings={"use_numpy": True}, ) - fail_results = { + prs_in_release_missing_verification = False + results_dfs = { + "prs_in_release": [], "job_statuses": get_commit_statuses(commit_sha), "checks_fails": get_checks_fails(db_client, commit_sha, branch_name), "checks_known_fails": [], @@ -826,8 +1035,17 @@ def create_workflow_report( "docker_images_cves": [], } + if pr_number == 0 and not mark_preview: + try: + prs_df, prs_in_release_missing_verification = get_prs_in_release_dataframe( + branch_name, cwd=os.getcwd() + ) + results_dfs["prs_in_release"] = prs_df + except Exception as e: + print(f"Error in get_prs_in_release_dataframe: {e}") + try: - fail_results["docker_images_cves"] = ( + results_dfs["docker_images_cves"] = ( [] if not check_cves else get_cves(pr_number, commit_sha, branch_name) ) except Exception as e: @@ -835,7 +1053,7 @@ def create_workflow_report( # get_cves returns ... in the case where no Grype result files were found. # This might occur when run in preview mode. - cves_not_checked = not check_cves or fail_results["docker_images_cves"] is ... + cves_not_checked = not check_cves or results_dfs["docker_images_cves"] is ... if known_fails_file_path: if not os.path.exists(known_fails_file_path): @@ -843,7 +1061,7 @@ def create_workflow_report( else: known_fails = get_broken_tests_rules(known_fails_file_path) - fail_results["checks_known_fails"] = get_checks_known_fails( + results_dfs["checks_known_fails"] = get_checks_known_fails( db_client, commit_sha, branch_name, known_fails ) @@ -855,24 +1073,24 @@ def create_workflow_report( pr_info_html = f""" #{pr_info.get("number")} ({pr_info.get("base", {}).get('ref')} <- {pr_info.get("head", {}).get('ref')}) {pr_info.get("title")} """ - fail_results["pr_new_fails"] = get_new_fails_this_pr( + results_dfs["pr_new_fails"] = get_new_fails_this_pr( db_client, pr_info, - fail_results["checks_fails"], - fail_results["regression_fails"], + results_dfs["checks_fails"], + results_dfs["regression_fails"], ) except Exception as e: pr_info_html = e pr_info = {} - fail_results["job_statuses"] = backfill_skipped_statuses( - fail_results["job_statuses"], pr_number, branch_name, commit_sha + results_dfs["job_statuses"] = backfill_skipped_statuses( + results_dfs["job_statuses"], pr_number, branch_name, commit_sha ) high_cve_count = 0 - if not cves_not_checked and len(fail_results["docker_images_cves"]) > 0: + if not cves_not_checked and len(results_dfs["docker_images_cves"]) > 0: high_cve_count = ( - fail_results["docker_images_cves"]["severity"] + results_dfs["docker_images_cves"]["severity"] .str.lower() .isin(("high", "critical")) .sum() @@ -893,43 +1111,54 @@ def create_workflow_report( "workflow_id": run_id, "commit_sha": commit_sha, "base_sha": "" if pr_number == 0 else pr_info.get("base", {}).get("sha"), - "date": f"{datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC", + "date": f"{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC", "is_preview": mark_preview, + "prs_in_release_missing_verification": prs_in_release_missing_verification, "counts": { - "jobs_status": f"{sum(fail_results['job_statuses']['job_status'].value_counts().get(x, 0) for x in ('failure', 'error'))} fail/error", - "checks_errors": len(fail_results["checks_errors"]), - "checks_new_fails": len(fail_results["checks_fails"]), - "regression_new_fails": len(fail_results["regression_fails"]), + "jobs_status": f"{sum(results_dfs['job_statuses']['job_status'].value_counts().get(x, 0) for x in ('failure', 'error'))} fail/error", + "checks_errors": len(results_dfs["checks_errors"]), + "checks_new_fails": len(results_dfs["checks_fails"]), + "regression_new_fails": len(results_dfs["regression_fails"]), "cves": "N/A" if cves_not_checked else f"{high_cve_count} high/critical", "checks_known_fails": ( - "N/A" if not known_fails else len(fail_results["checks_known_fails"]) + "N/A" if not known_fails else len(results_dfs["checks_known_fails"]) + ), + "pr_new_fails": len(results_dfs["pr_new_fails"]), + "prs_in_release": ( + "N/A" + if mark_preview or pr_number != 0 + else len(results_dfs["prs_in_release"]) ), - "pr_new_fails": len(fail_results["pr_new_fails"]), }, "build_report_links": get_build_report_links( - fail_results["job_statuses"], pr_number, branch_name, commit_sha + results_dfs["job_statuses"], pr_number, branch_name, commit_sha + ), + "prs_in_release_html": ( + "

PR details are not loaded during preview.

" + if mark_preview or pr_number != 0 + else format_results_as_html_table(results_dfs["prs_in_release"]) ), "ci_jobs_status_html": format_results_as_html_table( - fail_results["job_statuses"] + results_dfs["job_statuses"] ), "checks_errors_html": format_results_as_html_table( - fail_results["checks_errors"] + results_dfs["checks_errors"] ), - "checks_fails_html": format_results_as_html_table(fail_results["checks_fails"]), + "checks_fails_html": format_results_as_html_table(results_dfs["checks_fails"]), "regression_fails_html": format_results_as_html_table( - fail_results["regression_fails"] + results_dfs["regression_fails"] ), "docker_images_cves_html": ( "

Not Checked

" if cves_not_checked - else format_results_as_html_table(fail_results["docker_images_cves"]) + else format_results_as_html_table(results_dfs["docker_images_cves"]) ), "checks_known_fails_html": ( "

Not Checked

" if not known_fails - else format_results_as_html_table(fail_results["checks_known_fails"]) + else format_results_as_html_table(results_dfs["checks_known_fails"]) ), - "new_fails_html": format_results_as_html_table(fail_results["pr_new_fails"]), + "new_fails_html": format_results_as_html_table(results_dfs["pr_new_fails"]), } # Render the template with the context From c175440d692047c4fbcedac35d1c4462321570c2 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:37:14 -0400 Subject: [PATCH 19/33] highlight labels for unverified PRs --- .../create_workflow_report.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index a4f43e172459..505872d6260b 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -32,6 +32,8 @@ CVE_SEVERITY_ORDER = {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5} +PR_LABELS_VERIFIED = {"verified", "verified-with-issue", "no-verification-needed"} + def _is_clickhouse_memory_limit_error(exc: BaseException) -> bool: if isinstance(exc, ServerException) and getattr(exc, "code", None) == 241: return True @@ -173,7 +175,7 @@ def get_run_details(run_id: str) -> dict: def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.DataFrame, bool]: if len(df) == 0: - return pd.DataFrame(columns=["pr_number", "pr_name", "labels"]), False + return pd.DataFrame(columns=["pr_number", "pr_name", "pr_labels"]), False if not GITHUB_TOKEN: raise Exception("GITHUB_TOKEN is required to fetch PR titles and labels") headers = { @@ -193,16 +195,15 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da ) pr = response.json() label_names = [l["name"] for l in pr.get("labels", [])] - if any(ln.lower() == "cicd" for ln in label_names): + if "cicd" in label_names: continue - lowered = {ln.lower() for ln in label_names} - if "verified" not in lowered and "verified-with-issue" not in lowered: + if not PR_LABELS_VERIFIED.intersection(label_names): missing_verification = True rows.append( { "pr_number": pr_number, "pr_name": pr.get("title", ""), - "labels": ", ".join(sorted(label_names)), + "pr_labels": ", ".join(sorted(label_names)), } ) return pd.DataFrame(rows), missing_verification @@ -337,12 +338,12 @@ def get_prs_in_release_dataframe( repo: str = GITHUB_REPO, cwd: str | None = None, ) -> tuple[pd.DataFrame, bool]: - """ + f""" PRs merged into branch_ref that belong in the next release notes: after the latest GitHub Release tag on this history, or after the oldest rebase bootstrap if no such tag exists. Only merge commits whose subject has from / (e.g. from Altinity/) are included. Columns: pr_number, pr_name, labels. Omits PRs labeled cicd. - Second value is True if any listed PR lacks verified or verified-with-issue. + Second value is True if any listed PR lacks verified labels. """ branch_sha = _git_rev_parse(branch_ref, cwd) if not branch_sha: @@ -794,6 +795,15 @@ def url_to_html_link(url: str) -> str: return f'{text}' +def format_pr_labels_with_verification(labels: str) -> str: + """Format the PR labels with verification.""" + labels_list = labels.split(", ") + if PR_LABELS_VERIFIED.intersection(labels_list): + return labels + else: + return f'{labels} (missing verification)' + + def format_test_name_for_linewrap(text: str) -> str: """Tweak the test name to improve line wrapping.""" return f'{text}' @@ -843,6 +853,7 @@ def format_col_name(col_name: str) -> str: "PR Number": lambda n: url_to_html_link( f"https://github.com/{GITHUB_REPO}/pull/{n}" ), + "PR Labels": format_pr_labels_with_verification, } html = results.to_html( From a7113d0e0a3345ee8ebc3ae683a2407c003c9b42 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:43:21 -0400 Subject: [PATCH 20/33] report: fix _find_rebase_baseline --- .../create_workflow_report/create_workflow_report.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index 505872d6260b..bade4193aaac 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -314,10 +314,11 @@ def _find_rebase_baseline(branch_ref: str, cwd: str | None) -> str | None: "log", branch_ref, "--reverse", - "--merges", "-i", - "--grep=rebase-cicd", - "--grep=rebase/", + "-E", + "--grep=^Rebase CICD", + "--grep=Merge pull request .*rebase-cicd", + "--grep=Merge pull request .*from Altinity/rebase/", "--format=%H", ], cwd=cwd, From 458420e92b659310166826514d161ab1311e610f Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:47:20 -0400 Subject: [PATCH 21/33] ensure extra git history is fetched in get_prs_in_release_dataframe --- .../create_workflow_report.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index bade4193aaac..778452ef0a6a 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse import os +import sys import time from pathlib import Path from itertools import combinations @@ -337,7 +338,7 @@ def get_prs_in_release_dataframe( branch_ref: str = "HEAD", *, repo: str = GITHUB_REPO, - cwd: str | None = None, + cwd: str, ) -> tuple[pd.DataFrame, bool]: f""" PRs merged into branch_ref that belong in the next release notes: after the latest GitHub @@ -350,8 +351,25 @@ def get_prs_in_release_dataframe( if not branch_sha: raise Exception(f"Cannot resolve branch ref: {branch_ref!r}") + # CI often checks out with --depth=1 and --no-tags; fetch enough history and tags once. + subprocess.run( + ["git", "fetch", "--tags", "origin"], + cwd=cwd, + capture_output=True, + text=True, + check=False, + ) + baseline_ref, baseline_sha = _find_release_baseline(branch_ref, repo, cwd) if not baseline_sha: + # If no release tag, try to find rebase commit 200 commits back. + subprocess.run( + ["git", "fetch", "--deepen=200", "origin", branch_ref], + cwd=cwd, + capture_output=True, + text=True, + check=False, + ) rebase_sha = _find_rebase_baseline(branch_ref, cwd) if not rebase_sha: raise Exception( From 27215e627360393b9198cab5496ea8f117d1a1c3 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 1 Apr 2026 09:53:34 -0400 Subject: [PATCH 22/33] no longer consider labels no-verification-needed and cicd --- .../actions/create_workflow_report/create_workflow_report.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index 778452ef0a6a..1eddb30b83eb 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -33,7 +33,7 @@ CVE_SEVERITY_ORDER = {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5} -PR_LABELS_VERIFIED = {"verified", "verified-with-issue", "no-verification-needed"} +PR_LABELS_VERIFIED = {"verified", "verified-with-issue"} def _is_clickhouse_memory_limit_error(exc: BaseException) -> bool: if isinstance(exc, ServerException) and getattr(exc, "code", None) == 241: @@ -196,8 +196,6 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da ) pr = response.json() label_names = [l["name"] for l in pr.get("labels", [])] - if "cicd" in label_names: - continue if not PR_LABELS_VERIFIED.intersection(label_names): missing_verification = True rows.append( From 7618b0b5ffdd204743da4257dfcf80f423e31157 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:11:57 -0400 Subject: [PATCH 23/33] simplify verified check --- .../ci_run_report.html.jinja | 3 --- .../create_workflow_report.py | 21 ++++++------------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja index a6e0df17c49f..eba0a862561f 100644 --- a/.github/actions/create_workflow_report/ci_run_report.html.jinja +++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja @@ -180,9 +180,6 @@ {%- if pr_number == 0 -%}

PRs in Release

- {% if prs_in_release_missing_verification %} -

Some PRs are missing verification.

- {% endif %} {{ prs_in_release_html }} {%- endif %} diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index 1eddb30b83eb..8e03badf464c 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -33,7 +33,6 @@ CVE_SEVERITY_ORDER = {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5} -PR_LABELS_VERIFIED = {"verified", "verified-with-issue"} def _is_clickhouse_memory_limit_error(exc: BaseException) -> bool: if isinstance(exc, ServerException) and getattr(exc, "code", None) == 241: @@ -174,9 +173,9 @@ def get_run_details(run_id: str) -> dict: return response.json() -def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.DataFrame, bool]: +def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> pd.DataFrame: if len(df) == 0: - return pd.DataFrame(columns=["pr_number", "pr_name", "pr_labels"]), False + return pd.DataFrame(columns=["pr_number", "pr_name", "pr_labels"]) if not GITHUB_TOKEN: raise Exception("GITHUB_TOKEN is required to fetch PR titles and labels") headers = { @@ -184,7 +183,6 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da "Accept": "application/vnd.github.v3+json", } rows = [] - missing_verification = False for pr_number in df["pr_number"].tolist(): response = requests.get( f"https://api.github.com/repos/{repo}/pulls/{pr_number}", @@ -196,8 +194,6 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da ) pr = response.json() label_names = [l["name"] for l in pr.get("labels", [])] - if not PR_LABELS_VERIFIED.intersection(label_names): - missing_verification = True rows.append( { "pr_number": pr_number, @@ -205,7 +201,7 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da "pr_labels": ", ".join(sorted(label_names)), } ) - return pd.DataFrame(rows), missing_verification + return pd.DataFrame(rows) def _git_rev_parse(ref: str, cwd: str | None) -> str | None: @@ -337,13 +333,12 @@ def get_prs_in_release_dataframe( *, repo: str = GITHUB_REPO, cwd: str, -) -> tuple[pd.DataFrame, bool]: +) -> pd.DataFrame: f""" PRs merged into branch_ref that belong in the next release notes: after the latest GitHub Release tag on this history, or after the oldest rebase bootstrap if no such tag exists. Only merge commits whose subject has from / (e.g. from Altinity/) are included. Columns: pr_number, pr_name, labels. Omits PRs labeled cicd. - Second value is True if any listed PR lacks verified labels. """ branch_sha = _git_rev_parse(branch_ref, cwd) if not branch_sha: @@ -815,7 +810,7 @@ def url_to_html_link(url: str) -> str: def format_pr_labels_with_verification(labels: str) -> str: """Format the PR labels with verification.""" labels_list = labels.split(", ") - if PR_LABELS_VERIFIED.intersection(labels_list): + if "verified" in labels_list: return labels else: return f'{labels} (missing verification)' @@ -1051,7 +1046,6 @@ def create_workflow_report( settings={"use_numpy": True}, ) - prs_in_release_missing_verification = False results_dfs = { "prs_in_release": [], "job_statuses": get_commit_statuses(commit_sha), @@ -1065,9 +1059,7 @@ def create_workflow_report( if pr_number == 0 and not mark_preview: try: - prs_df, prs_in_release_missing_verification = get_prs_in_release_dataframe( - branch_name, cwd=os.getcwd() - ) + prs_df = get_prs_in_release_dataframe(branch_name, cwd=os.getcwd()) results_dfs["prs_in_release"] = prs_df except Exception as e: print(f"Error in get_prs_in_release_dataframe: {e}") @@ -1141,7 +1133,6 @@ def create_workflow_report( "base_sha": "" if pr_number == 0 else pr_info.get("base", {}).get("sha"), "date": f"{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC", "is_preview": mark_preview, - "prs_in_release_missing_verification": prs_in_release_missing_verification, "counts": { "jobs_status": f"{sum(results_dfs['job_statuses']['job_status'].value_counts().get(x, 0) for x in ('failure', 'error'))} fail/error", "checks_errors": len(results_dfs["checks_errors"]), From 8d7c262f11db1e1038c5db9a0d1df848786772f7 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Mon, 18 May 2026 14:14:14 -0400 Subject: [PATCH 24/33] swap columns --- .../actions/create_workflow_report/create_workflow_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index 8e03badf464c..12adda5c8608 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -196,8 +196,8 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> pd.DataFram label_names = [l["name"] for l in pr.get("labels", [])] rows.append( { - "pr_number": pr_number, "pr_name": pr.get("title", ""), + "pr_number": pr_number, "pr_labels": ", ".join(sorted(label_names)), } ) From 6e748a713cbd50eb756d9482813a1e38abac4524 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Thu, 11 Jun 2026 15:14:18 -0400 Subject: [PATCH 25/33] add tabbed navigation to report --- .../ci_run_report.html.jinja | 169 +++++++++++++----- 1 file changed, 128 insertions(+), 41 deletions(-) diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja index eba0a862561f..86cf5cbc8ba2 100644 --- a/.github/actions/create_workflow_report/ci_run_report.html.jinja +++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja @@ -13,10 +13,11 @@ :root { --altinity-background: #000D45; --altinity-accent: #189DCF; - --altinity-highlight: #FFC600; + --altinity-link-hover: #FFC600; --altinity-gray: #6c757d; --altinity-light-gray: #f8f9fa; --altinity-white: #ffffff; + --altinity-hover: color-mix(in srgb, var(--altinity-accent) 12%, var(--altinity-white)); } /* Body and heading fonts */ @@ -45,7 +46,7 @@ /* General table styling */ table { - min-width: min(900px, 98vw); + min-width: min(900px, 100%); margin: 1rem 0; border-collapse: collapse; background-color: var(--altinity-white); @@ -107,7 +108,7 @@ /* Table body row styling */ tr:hover { - background-color: var(--altinity-light-gray); + background-color: var(--altinity-hover); } /* Table cell styling */ @@ -123,9 +124,69 @@ } a:hover { - color: var(--altinity-highlight); + color: var(--altinity-link-hover); text-decoration: underline; } + + /* Tab navigation: wrapping pill strip on a white surface that matches the tables */ + .tab { + display: flex; + flex-wrap: wrap; + gap: 0.375rem; + padding: 0.375rem; + border-radius: 0.75rem; + width: fit-content; + max-width: 100%; + margin: 1.5rem 0; + background: var(--altinity-white); + } + + .tab button.tablinks { + appearance: none; + border: 0; + background: transparent; + padding: 0.55rem 0.95rem; + border-radius: 0.5rem; + font: inherit; + cursor: pointer; + white-space: nowrap; + color: var(--altinity-background); + transition: background 0.12s, color 0.12s; + } + + .tab button.tablinks:hover { + background: var(--altinity-hover); + } + + .tab button.tablinks.active { + background: var(--altinity-accent); + color: var(--altinity-white); + font-weight: 600; + } + + .tabcontent { + display: none; + padding: 0; + overflow-x: auto; + animation: fadeEffect 0.3s; + } + + .tabcontent table { + min-width: 0; + } + + .tabcontent > *:first-child { + margin-top: 0; + } + + .tabcontent.active { + display: block; + } + + @keyframes fadeEffect { + from { opacity: 0; } + to { opacity: 1; } + } {{ title }} This is a preview. The workflow is not yet finished.

{% endif %} -

Table of Contents

- - - {%- if pr_number == 0 -%} -

PRs in Release

- {{ prs_in_release_html }} +
+ {%- if pr_number == 0 %}{% endif %} + {%- if pr_number != 0 %}{% endif %} + + + + + + +
+ + {%- if pr_number == 0 %} +
+ {{ prs_in_release_html }} +
{%- endif %} - {%- if pr_number != 0 -%} -

New Fails in PR

-

Compared with base sha {{ base_sha }}

- {{ new_fails_html }} + {%- if pr_number != 0 %} +
+

Compared with base sha {{ base_sha }}

+ {{ new_fails_html }} +
{%- endif %} -

CI Jobs Status

- {{ ci_jobs_status_html }} +
+ {{ ci_jobs_status_html }} +
+ +
+ {{ checks_errors_html }} +
-

Checks Errors

- {{ checks_errors_html }} +
+ {{ checks_fails_html }} +
-

Checks New Fails

- {{ checks_fails_html }} +
+ {{ regression_fails_html }} +
-

Regression New Fails

- {{ regression_fails_html }} +
+ {{ docker_images_cves_html }} +
-

Docker Images CVEs

- {{ docker_images_cves_html }} +
+

+ Fail reason conventions:
+ KNOWN - Accepted fail and fix is not planned
+ INVESTIGATE - We don't know why it fails
+ NEEDSFIX - Investigation done and a fix is needed to make it pass
+

+ {{ checks_known_fails_html }} +
-

Checks Known Fails

-

- Fail reason conventions:
- KNOWN - Accepted fail and fix is not planned
- INVESTIGATE - We don't know why it fails
- NEEDSFIX - Investigation done and a fix is needed to make it pass
-

- {{ checks_known_fails_html }} +