From 5ec03a9700f77ba6dd465e84b4775d4a7e4323e9 Mon Sep 17 00:00:00 2001
From: "Daniel Q. Kim"
Date: Wed, 20 May 2026 09:24:50 +0200
Subject: [PATCH 01/33] Fix Iceberg read optimization returning NULLs for
stats-less manifests
When an Iceberg manifest's per-file column statistics are absent or
empty (common for non-Spark writers like pyiceberg with default
settings), DataFileMetaInfo::columns_info is empty. The optimization
in StorageObjectStorageSource::createReader misread this as "all
columns are absent from the file" and returned constant NULLs for
every row while still returning the correct row count. Result: silent
data loss on icebergLocal, icebergS3, icebergAzure, icebergHDFS, and
all *Cluster variants.
Gate the optimization's absent-NULL loop directly on
columns_info.empty() instead of introducing a separate stats-presence
flag. When no usable per-column stats were parsed -- whether the
manifest omitted the stats fields entirely or declared them but left
them empty -- fall through to the Parquet reader, which correctly
handles physically-present columns (read normally) and
schema-evolved-absent columns (handled by
IcebergMetadata::getInitialSchemaByPath setting the file's own schema
as initial_header). columns_info is already serialized to workers in
the cluster JSON path, so this changes no serialization format and
keeps the fork's DataFileMetaInfo serde identical to upstream.
Closes Altinity/ClickHouse#1545.
Mirror of Altinity/ClickHouse#1688 (antalya-25.8 fix).
Signed-off-by: Daniel Q. Kim
---
.../StorageObjectStorageSource.cpp | 59 +-
...t_iceberg_read_optimization_empty_stats.py | 685 ++++++++++++++++++
2 files changed, 716 insertions(+), 28 deletions(-)
create mode 100644 tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
index ee0cf191045c..a4a92ab58599 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@@ -886,40 +886,43 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
}
}
}
- for (const auto & column : requested_columns_list)
+ if (file_meta_data.has_value() && !file_meta_data.value()->columns_info.empty())
{
- const auto & column_name = column.first;
+ for (const auto & column : requested_columns_list)
+ {
+ const auto & column_name = column.first;
- if (file_meta_data.value()->columns_info.contains(column_name))
- continue;
+ if (file_meta_data.value()->columns_info.contains(column_name))
+ continue;
- if (!column.second.second.type->isNullable())
- continue;
+ if (!column.second.second.type->isNullable())
+ continue;
- /// With View over Iceberg table we have someting like 'materialize(time)' as column_name
- /// Simple cheap check
- if (column_name.starts_with("materialize(") && column_name.ends_with(")"))
- continue;
+ /// With View over Iceberg table we have someting like 'materialize(time)' as column_name
+ /// Simple cheap check
+ if (column_name.starts_with("materialize(") && column_name.ends_with(")"))
+ continue;
- /// Skip columns produced by prewhere or row-level filter expressions —
- /// they are computed at read time, not stored in the file.
- if (format_filter_info
- && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name)
- || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name)))
- continue;
+ /// Skip columns produced by prewhere or row-level filter expressions —
+ /// they are computed at read time, not stored in the file.
+ if (format_filter_info
+ && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name)
+ || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name)))
+ continue;
- /// Column is nullable and absent in file
- constant_columns_with_values[column.second.first] =
- ConstColumnWithValue{
- column.second.second,
- Field()
- };
- constant_columns.insert(column_name);
-
- LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'",
- object_info->getPath(),
- column_name,
- column.second.second.type);
+ /// Column is nullable and absent in file
+ constant_columns_with_values[column.second.first] =
+ ConstColumnWithValue{
+ column.second.second,
+ Field()
+ };
+ constant_columns.insert(column_name);
+
+ LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'",
+ object_info->getPath(),
+ column_name,
+ column.second.second.type);
+ }
}
}
diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
new file mode 100644
index 000000000000..34dc2fda2918
--- /dev/null
+++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
@@ -0,0 +1,685 @@
+#!/usr/bin/env python3
+"""
+Reproducer for Altinity/ClickHouse#1545:
+icebergLocal() returns all-NULL columns when allow_experimental_iceberg_read_optimization=1
+and the Iceberg manifest has no column statistics.
+
+Root cause
+----------
+The manifest Avro writer schema intentionally omits the optional stats fields
+(value_counts, column_sizes, null_value_counts, lower_bounds, upper_bounds).
+AvroForIcebergDeserializer.hasPath() therefore returns false for all three stats
+paths in ManifestFile.cpp. columns_infos is left empty.
+
+StorageObjectStorageSource::createReader's second loop (the schema-evolution
+"absent column as NULL" path) iterates over every requested column. Because
+columns_infos is empty, every nullable column is absent → each is injected as a
+constant NULL. With all requested columns constant, need_only_count is set to
+true and the Parquet file is read in count-only mode; the output is the correct
+number of rows, but every value is NULL.
+
+Tests
+-----
+- test_iceberg_local_returns_actual_rows_with_stats_less_manifest
+ FAILS on unpatched code: returns all-NULL rows instead of real data.
+
+- test_iceberg_local_returns_correct_rows_when_optimization_disabled
+ PASSES on unpatched code: regression guard — optimization=0 bypasses the
+ buggy path and reads the Parquet file normally.
+"""
+
+import json
+import os
+import tempfile
+import time
+import uuid
+
+import avro.datafile
+import avro.io
+import avro.schema
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from helpers.iceberg_utils import get_uuid_str
+from helpers.s3_tools import LocalUploader
+
+
+# Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse).
+_MANIFEST_LIST_SCHEMA_STR = json.dumps({
+ "type": "record",
+ "name": "manifest_file",
+ "fields": [
+ {"name": "manifest_path", "type": "string"},
+ {"name": "manifest_length", "type": "long"},
+ {"name": "partition_spec_id", "type": "int"},
+ {"name": "content", "type": "int"},
+ {"name": "sequence_number", "type": "long"},
+ {"name": "min_sequence_number", "type": "long"},
+ {"name": "added_snapshot_id", "type": "long"},
+ {"name": "added_files_count", "type": "int"},
+ {"name": "existing_files_count", "type": "int"},
+ {"name": "deleted_files_count", "type": "int"},
+ {"name": "added_rows_count", "type": "long"},
+ {"name": "existing_rows_count", "type": "long"},
+ {"name": "deleted_rows_count", "type": "long"},
+ ],
+})
+
+# Manifest entry Avro schema that deliberately omits all per-column stats fields:
+# column_sizes, value_counts, null_value_counts, lower_bounds, upper_bounds.
+#
+# Absent fields → AvroForIcebergDeserializer.hasPath() returns false for each →
+# ManifestFile.cpp skips stats collection → columns_infos is empty →
+# the optimization treats every nullable column as a "schema-evolution absent column"
+# and injects a constant NULL.
+_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({
+ "type": "record",
+ "name": "manifest_entry",
+ "fields": [
+ {"name": "status", "type": "int"},
+ {"name": "snapshot_id", "type": ["null", "long"]},
+ {"name": "sequence_number", "type": ["null", "long"]},
+ {"name": "file_sequence_number", "type": ["null", "long"]},
+ {
+ "name": "data_file",
+ "type": {
+ "type": "record",
+ "name": "r2",
+ "fields": [
+ {"name": "content", "type": "int"},
+ {"name": "file_path", "type": "string"},
+ {"name": "file_format", "type": "string"},
+ {
+ "name": "partition",
+ "type": {"type": "record", "name": "r102", "fields": []},
+ },
+ {"name": "record_count", "type": "long"},
+ {"name": "file_size_in_bytes", "type": "long"},
+ ],
+ },
+ },
+ ],
+})
+
+
+def _write_avro(schema, records, path, metadata=None):
+ with open(path, "wb") as f:
+ writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), schema)
+ if metadata:
+ for k, v in metadata.items():
+ writer.set_meta(k, v if isinstance(v, bytes) else v.encode("utf-8"))
+ for rec in records:
+ writer.append(rec)
+ writer.close()
+
+
+def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
+ """
+ Build a minimal Iceberg v2 table under tmpdir/table_name. The manifest is
+ written with a stats-less Avro schema to trigger the optimization bug.
+
+ container_base is the absolute path inside the ClickHouse container where the
+ table will be placed after upload. File paths embedded in the Avro records and
+ metadata.json must reference container_base because they are interpreted at
+ query time inside the container.
+
+ Returns the local table directory path (tmpdir/table_name).
+ """
+ table_local = os.path.join(tmpdir, table_name)
+ data_local = os.path.join(table_local, "data")
+ meta_local = os.path.join(table_local, "metadata")
+ os.makedirs(data_local)
+ os.makedirs(meta_local)
+
+ # Parquet data file with two nullable columns.
+ arrow_schema = pa.schema([
+ pa.field("id", pa.int32(), nullable=True),
+ pa.field("data", pa.string(), nullable=True),
+ ])
+ pq.write_table(
+ pa.table(
+ {
+ "id": pa.array([1, 2, 3], type=pa.int32()),
+ "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
+ },
+ schema=arrow_schema,
+ ),
+ os.path.join(data_local, "00000-0-data.parquet"),
+ )
+ data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
+ data_container_path = f"{container_base}/data/00000-0-data.parquet"
+
+ snapshot_id = 1
+ seq_number = 1
+ ts_ms = int(time.time() * 1000)
+
+ # Iceberg schema and partition-spec to embed as Avro file-level metadata.
+ # ManifestFile.cpp requires both keys to be present in the manifest's Avro header.
+ iceberg_schema = {
+ "type": "struct",
+ "schema-id": 0,
+ "fields": [
+ {"id": 1, "name": "id", "required": False, "type": "int"},
+ {"id": 2, "name": "data", "required": False, "type": "string"},
+ ],
+ }
+
+ # Manifest file (stats-less Avro schema).
+ manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro")
+ manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
+ _write_avro(
+ avro.schema.parse(_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR),
+ [{
+ "status": 1, # ADDED
+ "snapshot_id": snapshot_id,
+ "sequence_number": seq_number,
+ "file_sequence_number": seq_number,
+ "data_file": {
+ "content": 0, # DATA
+ "file_path": data_container_path,
+ "file_format": "PARQUET",
+ "partition": {},
+ "record_count": 3,
+ "file_size_in_bytes": data_size,
+ },
+ }],
+ manifest_local_path,
+ metadata={
+ "schema": json.dumps(iceberg_schema),
+ "partition-spec": "[]",
+ },
+ )
+ manifest_size = os.path.getsize(manifest_local_path)
+
+ # Manifest list.
+ mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro"
+ mlist_local_path = os.path.join(meta_local, mlist_filename)
+ mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
+ _write_avro(
+ avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
+ [{
+ "manifest_path": manifest_container_path,
+ "manifest_length": manifest_size,
+ "partition_spec_id": 0,
+ "content": 0, # DATA
+ "sequence_number": seq_number,
+ "min_sequence_number": seq_number,
+ "added_snapshot_id": snapshot_id,
+ "added_files_count": 1,
+ "existing_files_count": 0,
+ "deleted_files_count": 0,
+ "added_rows_count": 3,
+ "existing_rows_count": 0,
+ "deleted_rows_count": 0,
+ }],
+ mlist_local_path,
+ )
+
+ # Table metadata JSON.
+ metadata = {
+ "format-version": 2,
+ "table-uuid": str(uuid.uuid4()),
+ "location": container_base,
+ "last-sequence-number": seq_number,
+ "last-updated-ms": ts_ms,
+ "last-column-id": 2,
+ "current-schema-id": 0,
+ "schemas": [{
+ "type": "struct",
+ "schema-id": 0,
+ "fields": [
+ {"id": 1, "name": "id", "required": False, "type": "int"},
+ {"id": 2, "name": "data", "required": False, "type": "string"},
+ ],
+ }],
+ "default-spec-id": 0,
+ "partition-specs": [{"spec-id": 0, "fields": []}],
+ "last-partition-id": 999,
+ "default-sort-order-id": 0,
+ "sort-orders": [{"order-id": 0, "fields": []}],
+ "properties": {},
+ "current-snapshot-id": snapshot_id,
+ "snapshots": [{
+ "snapshot-id": snapshot_id,
+ "sequence-number": seq_number,
+ "timestamp-ms": ts_ms,
+ "manifest-list": mlist_container_path,
+ "summary": {"operation": "append"},
+ "schema-id": 0,
+ }],
+ "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
+ "metadata-log": [],
+ "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
+ }
+ with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
+ json.dump(metadata, f, indent=2)
+
+ return table_local
+
+
+def _upload_to_container(instance, local_table_dir, container_base):
+ uploader = LocalUploader(instance)
+ for root, _dirs, files in os.walk(local_table_dir):
+ for fname in files:
+ local_path = os.path.join(root, fname)
+ rel = os.path.relpath(local_path, local_table_dir)
+ uploader.upload_file(local_path, os.path.join(container_base, rel))
+
+
+# Manifest entry schema with value_counts only (partial stats).
+# column_sizes and null_value_counts are absent. value_counts alone is enough to
+# populate columns_info so the absent-NULL guard (columns_info.empty()) is not triggered.
+_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = json.dumps({
+ "type": "record",
+ "name": "manifest_entry",
+ "fields": [
+ {"name": "status", "type": "int"},
+ {"name": "snapshot_id", "type": ["null", "long"]},
+ {"name": "sequence_number", "type": ["null", "long"]},
+ {"name": "file_sequence_number", "type": ["null", "long"]},
+ {
+ "name": "data_file",
+ "type": {
+ "type": "record",
+ "name": "r2",
+ "fields": [
+ {"name": "content", "type": "int"},
+ {"name": "file_path", "type": "string"},
+ {"name": "file_format", "type": "string"},
+ {
+ "name": "partition",
+ "type": {"type": "record", "name": "r102", "fields": []},
+ },
+ {"name": "record_count", "type": "long"},
+ {"name": "file_size_in_bytes", "type": "long"},
+ # Only value_counts is present (column_sizes and null_value_counts absent).
+ {
+ "name": "value_counts",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "k81v81",
+ "fields": [
+ {"name": "key", "type": "int"},
+ {"name": "value", "type": "long"},
+ ],
+ },
+ },
+ },
+ ],
+ },
+ },
+ ],
+})
+
+# Manifest entry schema with all three stats fields (value_counts, column_sizes,
+# null_value_counts). Represents a manifest written by a full-stats writer such as Spark.
+_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = json.dumps({
+ "type": "record",
+ "name": "manifest_entry",
+ "fields": [
+ {"name": "status", "type": "int"},
+ {"name": "snapshot_id", "type": ["null", "long"]},
+ {"name": "sequence_number", "type": ["null", "long"]},
+ {"name": "file_sequence_number", "type": ["null", "long"]},
+ {
+ "name": "data_file",
+ "type": {
+ "type": "record",
+ "name": "r2",
+ "fields": [
+ {"name": "content", "type": "int"},
+ {"name": "file_path", "type": "string"},
+ {"name": "file_format", "type": "string"},
+ {
+ "name": "partition",
+ "type": {"type": "record", "name": "r102", "fields": []},
+ },
+ {"name": "record_count", "type": "long"},
+ {"name": "file_size_in_bytes", "type": "long"},
+ {
+ "name": "column_sizes",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "k81v81",
+ "fields": [
+ {"name": "key", "type": "int"},
+ {"name": "value", "type": "long"},
+ ],
+ },
+ },
+ },
+ {
+ "name": "value_counts",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "k81v81_vc",
+ "fields": [
+ {"name": "key", "type": "int"},
+ {"name": "value", "type": "long"},
+ ],
+ },
+ },
+ },
+ {
+ "name": "null_value_counts",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "k81v81_nc",
+ "fields": [
+ {"name": "key", "type": "int"},
+ {"name": "value", "type": "long"},
+ ],
+ },
+ },
+ },
+ ],
+ },
+ },
+ ],
+})
+
+
+def _create_iceberg_table_with_schema(tmpdir, table_name, container_base,
+ manifest_schema_str, manifest_extra_fields=None):
+ """
+ Build a minimal Iceberg v2 table. manifest_extra_fields is added to each
+ manifest entry's data_file record when using schemas that include stats.
+ """
+ table_local = os.path.join(tmpdir, table_name)
+ data_local = os.path.join(table_local, "data")
+ meta_local = os.path.join(table_local, "metadata")
+ os.makedirs(data_local)
+ os.makedirs(meta_local)
+
+ arrow_schema = pa.schema([
+ pa.field("id", pa.int32(), nullable=True),
+ pa.field("data", pa.string(), nullable=True),
+ ])
+ pq.write_table(
+ pa.table(
+ {
+ "id": pa.array([1, 2, 3], type=pa.int32()),
+ "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
+ },
+ schema=arrow_schema,
+ ),
+ os.path.join(data_local, "00000-0-data.parquet"),
+ )
+ data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
+ data_container_path = f"{container_base}/data/00000-0-data.parquet"
+
+ snapshot_id = 1
+ seq_number = 1
+ ts_ms = int(time.time() * 1000)
+
+ iceberg_schema = {
+ "type": "struct",
+ "schema-id": 0,
+ "fields": [
+ {"id": 1, "name": "id", "required": False, "type": "int"},
+ {"id": 2, "name": "data", "required": False, "type": "string"},
+ ],
+ }
+
+ data_file_record = {
+ "content": 0,
+ "file_path": data_container_path,
+ "file_format": "PARQUET",
+ "partition": {},
+ "record_count": 3,
+ "file_size_in_bytes": data_size,
+ }
+ if manifest_extra_fields:
+ data_file_record.update(manifest_extra_fields)
+
+ manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro")
+ manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
+ _write_avro(
+ avro.schema.parse(manifest_schema_str),
+ [{
+ "status": 1,
+ "snapshot_id": snapshot_id,
+ "sequence_number": seq_number,
+ "file_sequence_number": seq_number,
+ "data_file": data_file_record,
+ }],
+ manifest_local_path,
+ metadata={
+ "schema": json.dumps(iceberg_schema),
+ "partition-spec": "[]",
+ },
+ )
+ manifest_size = os.path.getsize(manifest_local_path)
+
+ mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro"
+ mlist_local_path = os.path.join(meta_local, mlist_filename)
+ mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
+ _write_avro(
+ avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
+ [{
+ "manifest_path": manifest_container_path,
+ "manifest_length": manifest_size,
+ "partition_spec_id": 0,
+ "content": 0,
+ "sequence_number": seq_number,
+ "min_sequence_number": seq_number,
+ "added_snapshot_id": snapshot_id,
+ "added_files_count": 1,
+ "existing_files_count": 0,
+ "deleted_files_count": 0,
+ "added_rows_count": 3,
+ "existing_rows_count": 0,
+ "deleted_rows_count": 0,
+ }],
+ mlist_local_path,
+ )
+
+ metadata = {
+ "format-version": 2,
+ "table-uuid": str(uuid.uuid4()),
+ "location": container_base,
+ "last-sequence-number": seq_number,
+ "last-updated-ms": ts_ms,
+ "last-column-id": 2,
+ "current-schema-id": 0,
+ "schemas": [{
+ "type": "struct",
+ "schema-id": 0,
+ "fields": [
+ {"id": 1, "name": "id", "required": False, "type": "int"},
+ {"id": 2, "name": "data", "required": False, "type": "string"},
+ ],
+ }],
+ "default-spec-id": 0,
+ "partition-specs": [{"spec-id": 0, "fields": []}],
+ "last-partition-id": 999,
+ "default-sort-order-id": 0,
+ "sort-orders": [{"order-id": 0, "fields": []}],
+ "properties": {},
+ "current-snapshot-id": snapshot_id,
+ "snapshots": [{
+ "snapshot-id": snapshot_id,
+ "sequence-number": seq_number,
+ "timestamp-ms": ts_ms,
+ "manifest-list": mlist_container_path,
+ "summary": {"operation": "append"},
+ "schema-id": 0,
+ }],
+ "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
+ "metadata-log": [],
+ "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
+ }
+ with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
+ json.dump(metadata, f, indent=2)
+
+ return table_local
+
+
+def test_iceberg_local_returns_actual_rows_with_stats_less_manifest(
+ started_cluster_iceberg_no_spark,
+):
+ """
+ FAILS on unpatched code (Altinity#1545).
+
+ With allow_experimental_iceberg_read_optimization=1 (the default), reading an
+ icebergLocal table whose manifest omits column statistics returns all-NULL rows
+ instead of real data. The correct output is 3 rows: (1,'hello'), (2,'world'),
+ (3,'iceberg').
+ """
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_opt_on_stats_less_" + get_uuid_str()
+ container_base = (
+ f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+ )
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
+ _upload_to_container(instance, local_dir, container_base)
+
+ result = instance.query(
+ f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+ " ORDER BY id"
+ " SETTINGS allow_experimental_iceberg_read_optimization=1"
+ ).strip()
+
+ assert result == "1\thello\n2\tworld\n3\ticeberg", (
+ f"Got: {result!r}\n"
+ "All-NULL rows means the bug fired: empty columns_infos caused the "
+ "optimization to treat every nullable column as an absent schema-evolution "
+ "column and inject constant NULL (need_only_count=1)."
+ )
+
+
+def test_iceberg_local_returns_correct_rows_when_optimization_disabled(
+ started_cluster_iceberg_no_spark,
+):
+ """
+ PASSES on unpatched code.
+
+ With allow_experimental_iceberg_read_optimization=0 the optimization block is
+ skipped entirely and the Parquet file is read normally. This test is a
+ regression guard: it must keep passing after the bug is fixed.
+ """
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_opt_off_stats_less_" + get_uuid_str()
+ container_base = (
+ f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+ )
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
+ _upload_to_container(instance, local_dir, container_base)
+
+ result = instance.query(
+ f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+ " ORDER BY id"
+ " SETTINGS allow_experimental_iceberg_read_optimization=0"
+ ).strip()
+
+ assert result == "1\thello\n2\tworld\n3\ticeberg"
+
+
+def test_iceberg_local_partial_stats_manifest_reads_correctly(
+ started_cluster_iceberg_no_spark,
+):
+ """
+ Regression test for Altinity#1545: partial stats (value_counts only).
+
+ A manifest that includes value_counts but omits column_sizes and
+ null_value_counts still populates columns_info. The fix must treat the
+ columns as real (not absent), so real data is returned.
+ """
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_opt_on_partial_stats_" + get_uuid_str()
+ container_base = (
+ f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+ )
+
+ # value_counts for field_id 1 (id) and 2 (data): 3 rows each.
+ extra = {
+ "value_counts": [
+ {"key": 1, "value": 3},
+ {"key": 2, "value": 3},
+ ],
+ }
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_dir = _create_iceberg_table_with_schema(
+ tmpdir, table_name, container_base,
+ _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
+ manifest_extra_fields=extra,
+ )
+ _upload_to_container(instance, local_dir, container_base)
+
+ result = instance.query(
+ f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+ " ORDER BY id"
+ " SETTINGS allow_experimental_iceberg_read_optimization=1"
+ ).strip()
+
+ assert result == "1\thello\n2\tworld\n3\ticeberg", (
+ f"Got: {result!r}\n"
+ "Partial-stats manifest (value_counts only) should not trigger the absent-NULL "
+ "path because columns_info is non-empty. All-NULL rows means the guard "
+ "fired incorrectly for partial-stats manifests."
+ )
+
+
+def test_iceberg_local_full_stats_manifest_reads_correctly(
+ started_cluster_iceberg_no_spark,
+):
+ """
+ Control test: full-stats manifest (Spark-like) must continue to return real data.
+
+ A manifest with all three stats fields (column_sizes, value_counts,
+ null_value_counts) is the common case written by Spark. This test verifies the
+ optimization does not regress for the normal path after the fix.
+ """
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_opt_on_full_stats_" + get_uuid_str()
+ container_base = (
+ f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+ )
+
+ # Full stats for field_id 1 (id) and 2 (data).
+ extra = {
+ "column_sizes": [
+ {"key": 1, "value": 64},
+ {"key": 2, "value": 128},
+ ],
+ "value_counts": [
+ {"key": 1, "value": 3},
+ {"key": 2, "value": 3},
+ ],
+ "null_value_counts": [
+ {"key": 1, "value": 0},
+ {"key": 2, "value": 0},
+ ],
+ }
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_dir = _create_iceberg_table_with_schema(
+ tmpdir, table_name, container_base,
+ _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
+ manifest_extra_fields=extra,
+ )
+ _upload_to_container(instance, local_dir, container_base)
+
+ result = instance.query(
+ f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+ " ORDER BY id"
+ " SETTINGS allow_experimental_iceberg_read_optimization=1"
+ ).strip()
+
+ assert result == "1\thello\n2\tworld\n3\ticeberg", (
+ f"Got: {result!r}\n"
+ "Full-stats manifest should return real data when optimization is enabled."
+ )
From 056eb1a3ec8926a0a1a9541cf767571b4e9219b5 Mon Sep 17 00:00:00 2001
From: "Daniel Q. Kim"
Date: Mon, 22 Jun 2026 14:55:28 +0200
Subject: [PATCH 02/33] Rework Iceberg empty-stats tests per review feedback
Address @zvonand's review of #1895: the tests were poorly scoped. The C++
fix (gate the optimization's absent-NULL loop on `!columns_info.empty()`) is
unchanged; only the integration tests are reworked.
- Drop the `allow_experimental_iceberg_read_optimization=0` test. The fix
only changes behavior when the optimization is enabled, so the disabled
path is unrelated to this change and belongs in a separate PR if at all.
- Add `test_stats_less_manifest_schema_evolution_absent_column`, a
discriminating test where the Iceberg schema declares `id`+`data` but the
Parquet file was written under the older schema and physically contains
only `id`. On the fixed build the stats-less manifest falls through to the
Parquet reader, reads `id` for real and synthesizes the absent `data`
column as NULL via `IcebergMetadata::getInitialSchemaByPath` (the file's
own schema becomes `initial_header`). On the unpatched build `id` is NULL
too, so the result discriminates.
- Add `test_stats_less_manifest_cluster_returns_real_data`, exercising the
cluster JSON serialization path with `icebergLocalCluster`. An empty
`columns_info` must round-trip to the workers and still read real data,
confirming the bug also affected the `*Cluster` variants.
- Add `test_non_empty_stats_absent_column_still_null`, a non-regression
guard that the optimization's legitimate absent-NULL path (a column absent
from a non-empty `columns_info`) keeps returning NULL while present columns
read for real.
- Merge the partial-stats and full-stats cases into one parametrized
`test_non_empty_stats_returns_real_data`.
- Keep the Iceberg fixtures generated in code rather than checked in as
static files (zvonand's readability comment). Each manifest entry and the
`metadata.json` embed per-test runtime values -- the UUID-suffixed
container path, the byte sizes of the freshly written Parquet/Avro files
and a random `table-uuid` -- so they are templates filled at run time, not
static blobs. The one static part, the Avro schemas, is deduplicated into
module constants, and a module-level comment explains the choice.
Verified on the fixed build: all 6 cases pass. With the guard reverted to
run the loop unconditionally, the three stats-less tests fail with all-NULL
rows while the two non-empty-stats guards pass.
Signed-off-by: Daniel Q. Kim
Co-Authored-By: Claude Opus 4.8
---
...t_iceberg_read_optimization_empty_stats.py | 895 ++++++++----------
1 file changed, 380 insertions(+), 515 deletions(-)
diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
index 34dc2fda2918..fe0c8e6d085f 100644
--- a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
+++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
@@ -1,31 +1,48 @@
#!/usr/bin/env python3
"""
-Reproducer for Altinity/ClickHouse#1545:
-icebergLocal() returns all-NULL columns when allow_experimental_iceberg_read_optimization=1
-and the Iceberg manifest has no column statistics.
+Regression coverage for Altinity/ClickHouse#1545:
+the Iceberg read optimization (``allow_experimental_iceberg_read_optimization=1``)
+returned all-NULL columns when the manifest carried no per-file column statistics.
Root cause
----------
-The manifest Avro writer schema intentionally omits the optional stats fields
-(value_counts, column_sizes, null_value_counts, lower_bounds, upper_bounds).
-AvroForIcebergDeserializer.hasPath() therefore returns false for all three stats
-paths in ManifestFile.cpp. columns_infos is left empty.
-
-StorageObjectStorageSource::createReader's second loop (the schema-evolution
-"absent column as NULL" path) iterates over every requested column. Because
-columns_infos is empty, every nullable column is absent → each is injected as a
-constant NULL. With all requested columns constant, need_only_count is set to
-true and the Parquet file is read in count-only mode; the output is the correct
-number of rows, but every value is NULL.
-
-Tests
------
-- test_iceberg_local_returns_actual_rows_with_stats_less_manifest
- FAILS on unpatched code: returns all-NULL rows instead of real data.
-
-- test_iceberg_local_returns_correct_rows_when_optimization_disabled
- PASSES on unpatched code: regression guard — optimization=0 bypasses the
- buggy path and reads the Parquet file normally.
+When a manifest's optional stats fields (``value_counts``, ``column_sizes``,
+``null_value_counts``, ``lower_bounds``, ``upper_bounds``) are all absent --
+common for non-Spark writers such as pyiceberg with default settings --
+``DataFileMetaInfo::columns_info`` is left empty. The optimization's
+absent-column loop in ``StorageObjectStorageSource::createReader`` then iterated
+every requested column, found none in the empty ``columns_info`` map, and
+injected each nullable column as a constant ``NULL``. With every requested
+column constant, ``need_only_count`` became true and the Parquet file was read
+in count-only mode: correct row count, every value ``NULL``.
+
+The fix gates that loop on ``!columns_info.empty()``; an empty map now falls
+through to the Parquet reader, which reads physically-present columns normally
+and synthesizes schema-evolved-absent columns as ``NULL`` via
+``IcebergMetadata::getInitialSchemaByPath``.
+
+Test taxonomy
+-------------
+Discriminating (fail on the unpatched build, pass on the fixed build):
+ - test_stats_less_manifest_returns_real_data
+ - test_stats_less_manifest_schema_evolution_absent_column
+Cluster/serialization path (the cluster JSON round-trip of an empty
+``columns_info``):
+ - test_stats_less_manifest_cluster_returns_real_data
+Non-regression guards (pass on both builds; the optimization's legitimate
+absent-NULL and present-column behavior must keep working when stats exist):
+ - test_non_empty_stats_absent_column_still_null
+ - test_non_empty_stats_returns_real_data
+
+Why the Iceberg fixtures are generated in code rather than checked in as static
+files: every manifest entry and the table ``metadata.json`` embed per-test
+runtime values -- the UUID-suffixed container path, the byte sizes of the
+freshly written Parquet and Avro files, and a random ``table-uuid`` -- so they
+are templates filled at run time, not the static blobs that a checked-in fixture
+(e.g. ``tests/queries/0_stateless/data_minio/.../v1.metadata.json``) would be.
+The one genuinely static part, the Avro schemas, is deduplicated into the
+module-level constants below; the stats-less manifest entry -- the artifact
+under test -- is necessarily produced in code.
"""
import json
@@ -39,11 +56,20 @@
import avro.schema
import pyarrow as pa
import pyarrow.parquet as pq
+import pytest
from helpers.iceberg_utils import get_uuid_str
from helpers.s3_tools import LocalUploader
+# The three rows every fixture stores; ``data`` is omitted from the Parquet file
+# when a test needs a column the file physically lacks.
+_IDS = [1, 2, 3]
+_DATA = ["hello", "world", "iceberg"]
+_REAL_DATA_RESULT = "1\thello\n2\tworld\n3\ticeberg"
+_DATA_NULL_RESULT = "1\t\\N\n2\t\\N\n3\t\\N"
+
+
# Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse).
_MANIFEST_LIST_SCHEMA_STR = json.dumps({
"type": "record",
@@ -65,46 +91,84 @@
],
})
-# Manifest entry Avro schema that deliberately omits all per-column stats fields:
-# column_sizes, value_counts, null_value_counts, lower_bounds, upper_bounds.
-#
-# Absent fields → AvroForIcebergDeserializer.hasPath() returns false for each →
-# ManifestFile.cpp skips stats collection → columns_infos is empty →
-# the optimization treats every nullable column as a "schema-evolution absent column"
-# and injects a constant NULL.
-_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({
- "type": "record",
- "name": "manifest_entry",
- "fields": [
- {"name": "status", "type": "int"},
- {"name": "snapshot_id", "type": ["null", "long"]},
- {"name": "sequence_number", "type": ["null", "long"]},
- {"name": "file_sequence_number", "type": ["null", "long"]},
- {
- "name": "data_file",
+
+def _data_file_fields(stats_fields):
+ """data_file record fields, optionally followed by the given stats arrays."""
+ fields = [
+ {"name": "content", "type": "int"},
+ {"name": "file_path", "type": "string"},
+ {"name": "file_format", "type": "string"},
+ {"name": "partition", "type": {"type": "record", "name": "r102", "fields": []}},
+ {"name": "record_count", "type": "long"},
+ {"name": "file_size_in_bytes", "type": "long"},
+ ]
+ for i, name in enumerate(stats_fields):
+ fields.append({
+ "name": name,
"type": {
- "type": "record",
- "name": "r2",
- "fields": [
- {"name": "content", "type": "int"},
- {"name": "file_path", "type": "string"},
- {"name": "file_format", "type": "string"},
- {
- "name": "partition",
- "type": {"type": "record", "name": "r102", "fields": []},
- },
- {"name": "record_count", "type": "long"},
- {"name": "file_size_in_bytes", "type": "long"},
- ],
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": f"k81v81_{i}",
+ "fields": [
+ {"name": "key", "type": "int"},
+ {"name": "value", "type": "long"},
+ ],
+ },
},
- },
- ],
-})
+ })
+ return fields
-def _write_avro(schema, records, path, metadata=None):
+def _manifest_entry_schema(stats_fields):
+ """Iceberg v2 manifest entry Avro schema carrying the given stats arrays."""
+ return json.dumps({
+ "type": "record",
+ "name": "manifest_entry",
+ "fields": [
+ {"name": "status", "type": "int"},
+ {"name": "snapshot_id", "type": ["null", "long"]},
+ {"name": "sequence_number", "type": ["null", "long"]},
+ {"name": "file_sequence_number", "type": ["null", "long"]},
+ {
+ "name": "data_file",
+ "type": {
+ "type": "record",
+ "name": "r2",
+ "fields": _data_file_fields(stats_fields),
+ },
+ },
+ ],
+ })
+
+
+# Stats-less manifest: omits every optional stats field -> columns_info is empty.
+_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = _manifest_entry_schema([])
+# Partial stats: value_counts only (column_sizes and null_value_counts absent);
+# enough to populate columns_info so the empty-stats guard is not triggered.
+_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = _manifest_entry_schema(["value_counts"])
+# Full stats: the common Spark-written shape.
+_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = _manifest_entry_schema(
+ ["column_sizes", "value_counts", "null_value_counts"]
+)
+
+
+def _schema_fields(with_data):
+ fields = [{"id": 1, "name": "id", "required": False, "type": "int"}]
+ if with_data:
+ fields.append({"id": 2, "name": "data", "required": False, "type": "string"})
+ return fields
+
+
+def _iceberg_schema(schema_id, with_data):
+ return {"type": "struct", "schema-id": schema_id, "fields": _schema_fields(with_data)}
+
+
+def _write_avro(schema_str, records, path, metadata=None):
with open(path, "wb") as f:
- writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), schema)
+ writer = avro.datafile.DataFileWriter(
+ f, avro.io.DatumWriter(), avro.schema.parse(schema_str)
+ )
if metadata:
for k, v in metadata.items():
writer.set_meta(k, v if isinstance(v, bytes) else v.encode("utf-8"))
@@ -113,17 +177,32 @@ def _write_avro(schema, records, path, metadata=None):
writer.close()
-def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
+def _create_iceberg_table(
+ tmpdir,
+ table_name,
+ container_base,
+ *,
+ manifest_entry_schema_str,
+ file_has_data_column=True,
+ data_file_stats=None,
+ schema_evolution=False,
+):
"""
- Build a minimal Iceberg v2 table under tmpdir/table_name. The manifest is
- written with a stats-less Avro schema to trigger the optimization bug.
+ Build a minimal Iceberg v2 table under ``tmpdir/table_name``.
container_base is the absolute path inside the ClickHouse container where the
- table will be placed after upload. File paths embedded in the Avro records and
- metadata.json must reference container_base because they are interpreted at
- query time inside the container.
-
- Returns the local table directory path (tmpdir/table_name).
+ table is placed after upload; file paths embedded in the Avro records and
+ metadata.json must reference it because they are interpreted at query time
+ inside the container.
+
+ file_has_data_column controls whether the Parquet file physically contains
+ the ``data`` column. data_file_stats, when given, supplies the manifest
+ entry's stats arrays (e.g. ``value_counts``). schema_evolution lays the
+ table out with two schemas (0: ``id`` only, 1: ``id``+``data``) and two
+ snapshots so the data file resolves to the older schema id, exercising the
+ Parquet-reader fall-through that synthesizes the absent ``data`` column.
+
+ Returns the local table directory path.
"""
table_local = os.path.join(tmpdir, table_name)
data_local = os.path.join(table_local, "data")
@@ -131,125 +210,137 @@ def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
os.makedirs(data_local)
os.makedirs(meta_local)
- # Parquet data file with two nullable columns.
- arrow_schema = pa.schema([
- pa.field("id", pa.int32(), nullable=True),
- pa.field("data", pa.string(), nullable=True),
- ])
+ arrow_fields = [pa.field("id", pa.int32(), nullable=True)]
+ table_data = {"id": pa.array(_IDS, type=pa.int32())}
+ if file_has_data_column:
+ arrow_fields.append(pa.field("data", pa.string(), nullable=True))
+ table_data["data"] = pa.array(_DATA, type=pa.string())
pq.write_table(
- pa.table(
- {
- "id": pa.array([1, 2, 3], type=pa.int32()),
- "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
- },
- schema=arrow_schema,
- ),
+ pa.table(table_data, schema=pa.schema(arrow_fields)),
os.path.join(data_local, "00000-0-data.parquet"),
)
data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
data_container_path = f"{container_base}/data/00000-0-data.parquet"
- snapshot_id = 1
- seq_number = 1
- ts_ms = int(time.time() * 1000)
+ ts_ms = int(time.time() * 1000)
- # Iceberg schema and partition-spec to embed as Avro file-level metadata.
- # ManifestFile.cpp requires both keys to be present in the manifest's Avro header.
- iceberg_schema = {
- "type": "struct",
- "schema-id": 0,
- "fields": [
- {"id": 1, "name": "id", "required": False, "type": "int"},
- {"id": 2, "name": "data", "required": False, "type": "string"},
- ],
+ # The manifest entry's snapshot_id is the snapshot that wrote the data file.
+ # Under schema evolution that snapshot (id 1) carries the older schema (0),
+ # so the iterator resolves the file to schema 0 while reading the table at
+ # schema 1 -- the condition that makes getInitialSchemaByPath return the
+ # file's own schema as initial_header.
+ entry_snapshot_id = 1
+ write_schema_id = 0 # schema the data file was written with
+
+ data_file_record = {
+ "content": 0, # DATA
+ "file_path": data_container_path,
+ "file_format": "PARQUET",
+ "partition": {},
+ "record_count": len(_IDS),
+ "file_size_in_bytes": data_size,
}
+ if data_file_stats:
+ data_file_record.update(data_file_stats)
- # Manifest file (stats-less Avro schema).
manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro")
manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
_write_avro(
- avro.schema.parse(_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR),
+ manifest_entry_schema_str,
[{
"status": 1, # ADDED
- "snapshot_id": snapshot_id,
- "sequence_number": seq_number,
- "file_sequence_number": seq_number,
- "data_file": {
- "content": 0, # DATA
- "file_path": data_container_path,
- "file_format": "PARQUET",
- "partition": {},
- "record_count": 3,
- "file_size_in_bytes": data_size,
- },
+ "snapshot_id": entry_snapshot_id,
+ "sequence_number": 1,
+ "file_sequence_number": 1,
+ "data_file": data_file_record,
}],
manifest_local_path,
metadata={
- "schema": json.dumps(iceberg_schema),
+ # The manifest's own schema is the data file's write schema.
+ "schema": json.dumps(_iceberg_schema(write_schema_id, file_has_data_column)),
"partition-spec": "[]",
},
)
manifest_size = os.path.getsize(manifest_local_path)
- # Manifest list.
- mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro"
- mlist_local_path = os.path.join(meta_local, mlist_filename)
- mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
- _write_avro(
- avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
- [{
- "manifest_path": manifest_container_path,
- "manifest_length": manifest_size,
- "partition_spec_id": 0,
- "content": 0, # DATA
- "sequence_number": seq_number,
- "min_sequence_number": seq_number,
- "added_snapshot_id": snapshot_id,
- "added_files_count": 1,
- "existing_files_count": 0,
- "deleted_files_count": 0,
- "added_rows_count": 3,
- "existing_rows_count": 0,
- "deleted_rows_count": 0,
- }],
- mlist_local_path,
- )
+ def _write_manifest_list(snapshot_id):
+ filename = f"snap-{snapshot_id}-0-manifest-list.avro"
+ local_path = os.path.join(meta_local, filename)
+ container_path = f"{container_base}/metadata/{filename}"
+ _write_avro(
+ _MANIFEST_LIST_SCHEMA_STR,
+ [{
+ "manifest_path": manifest_container_path,
+ "manifest_length": manifest_size,
+ "partition_spec_id": 0,
+ "content": 0, # DATA
+ "sequence_number": 1,
+ "min_sequence_number": 1,
+ "added_snapshot_id": entry_snapshot_id,
+ "added_files_count": 1,
+ "existing_files_count": 0,
+ "deleted_files_count": 0,
+ "added_rows_count": len(_IDS),
+ "existing_rows_count": 0,
+ "deleted_rows_count": 0,
+ }],
+ local_path,
+ )
+ return container_path
+
+ if schema_evolution:
+ # Two schemas, two snapshots: the data file was written under schema 0
+ # (id only); the table later evolved to schema 1 (id + data).
+ schemas = [_iceberg_schema(0, with_data=False), _iceberg_schema(1, with_data=True)]
+ current_schema = 1
+ last_column_id = 2
+ snapshots = [
+ {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)},
+ {"snapshot-id": 2, "schema-id": 1, "manifest-list": _write_manifest_list(2)},
+ ]
+ current_snapshot = 2
+ last_seq = 2
+ else:
+ # Single schema (id + data); the data file resolves to it directly.
+ schemas = [_iceberg_schema(0, with_data=True)]
+ current_schema = 0
+ last_column_id = 2
+ snapshots = [
+ {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)},
+ ]
+ current_snapshot = 1
+ last_seq = 1
- # Table metadata JSON.
metadata = {
"format-version": 2,
"table-uuid": str(uuid.uuid4()),
"location": container_base,
- "last-sequence-number": seq_number,
+ "last-sequence-number": last_seq,
"last-updated-ms": ts_ms,
- "last-column-id": 2,
- "current-schema-id": 0,
- "schemas": [{
- "type": "struct",
- "schema-id": 0,
- "fields": [
- {"id": 1, "name": "id", "required": False, "type": "int"},
- {"id": 2, "name": "data", "required": False, "type": "string"},
- ],
- }],
+ "last-column-id": last_column_id,
+ "current-schema-id": current_schema,
+ "schemas": schemas,
"default-spec-id": 0,
"partition-specs": [{"spec-id": 0, "fields": []}],
"last-partition-id": 999,
"default-sort-order-id": 0,
"sort-orders": [{"order-id": 0, "fields": []}],
"properties": {},
- "current-snapshot-id": snapshot_id,
- "snapshots": [{
- "snapshot-id": snapshot_id,
- "sequence-number": seq_number,
- "timestamp-ms": ts_ms,
- "manifest-list": mlist_container_path,
- "summary": {"operation": "append"},
- "schema-id": 0,
- }],
- "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
+ "current-snapshot-id": current_snapshot,
+ "snapshots": [
+ {
+ "snapshot-id": s["snapshot-id"],
+ "sequence-number": s["snapshot-id"],
+ "timestamp-ms": ts_ms,
+ "manifest-list": s["manifest-list"],
+ "summary": {"operation": "append"},
+ "schema-id": s["schema-id"],
+ }
+ for s in snapshots
+ ],
+ "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": s["snapshot-id"]} for s in snapshots],
"metadata-log": [],
- "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
+ "refs": {"main": {"snapshot-id": current_snapshot, "type": "branch"}},
}
with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
json.dump(metadata, f, indent=2)
@@ -257,7 +348,7 @@ def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
return table_local
-def _upload_to_container(instance, local_table_dir, container_base):
+def _upload_to_node(instance, local_table_dir, container_base):
uploader = LocalUploader(instance)
for root, _dirs, files in os.walk(local_table_dir):
for fname in files:
@@ -266,420 +357,194 @@ def _upload_to_container(instance, local_table_dir, container_base):
uploader.upload_file(local_path, os.path.join(container_base, rel))
-# Manifest entry schema with value_counts only (partial stats).
-# column_sizes and null_value_counts are absent. value_counts alone is enough to
-# populate columns_info so the absent-NULL guard (columns_info.empty()) is not triggered.
-_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = json.dumps({
- "type": "record",
- "name": "manifest_entry",
- "fields": [
- {"name": "status", "type": "int"},
- {"name": "snapshot_id", "type": ["null", "long"]},
- {"name": "sequence_number", "type": ["null", "long"]},
- {"name": "file_sequence_number", "type": ["null", "long"]},
- {
- "name": "data_file",
- "type": {
- "type": "record",
- "name": "r2",
- "fields": [
- {"name": "content", "type": "int"},
- {"name": "file_path", "type": "string"},
- {"name": "file_format", "type": "string"},
- {
- "name": "partition",
- "type": {"type": "record", "name": "r102", "fields": []},
- },
- {"name": "record_count", "type": "long"},
- {"name": "file_size_in_bytes", "type": "long"},
- # Only value_counts is present (column_sizes and null_value_counts absent).
- {
- "name": "value_counts",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "k81v81",
- "fields": [
- {"name": "key", "type": "int"},
- {"name": "value", "type": "long"},
- ],
- },
- },
- },
- ],
- },
- },
- ],
-})
-
-# Manifest entry schema with all three stats fields (value_counts, column_sizes,
-# null_value_counts). Represents a manifest written by a full-stats writer such as Spark.
-_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = json.dumps({
- "type": "record",
- "name": "manifest_entry",
- "fields": [
- {"name": "status", "type": "int"},
- {"name": "snapshot_id", "type": ["null", "long"]},
- {"name": "sequence_number", "type": ["null", "long"]},
- {"name": "file_sequence_number", "type": ["null", "long"]},
- {
- "name": "data_file",
- "type": {
- "type": "record",
- "name": "r2",
- "fields": [
- {"name": "content", "type": "int"},
- {"name": "file_path", "type": "string"},
- {"name": "file_format", "type": "string"},
- {
- "name": "partition",
- "type": {"type": "record", "name": "r102", "fields": []},
- },
- {"name": "record_count", "type": "long"},
- {"name": "file_size_in_bytes", "type": "long"},
- {
- "name": "column_sizes",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "k81v81",
- "fields": [
- {"name": "key", "type": "int"},
- {"name": "value", "type": "long"},
- ],
- },
- },
- },
- {
- "name": "value_counts",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "k81v81_vc",
- "fields": [
- {"name": "key", "type": "int"},
- {"name": "value", "type": "long"},
- ],
- },
- },
- },
- {
- "name": "null_value_counts",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "k81v81_nc",
- "fields": [
- {"name": "key", "type": "int"},
- {"name": "value", "type": "long"},
- ],
- },
- },
- },
- ],
- },
- },
- ],
-})
+def _container_base(table_name):
+ return f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
-def _create_iceberg_table_with_schema(tmpdir, table_name, container_base,
- manifest_schema_str, manifest_extra_fields=None):
- """
- Build a minimal Iceberg v2 table. manifest_extra_fields is added to each
- manifest entry's data_file record when using schemas that include stats.
- """
- table_local = os.path.join(tmpdir, table_name)
- data_local = os.path.join(table_local, "data")
- meta_local = os.path.join(table_local, "metadata")
- os.makedirs(data_local)
- os.makedirs(meta_local)
-
- arrow_schema = pa.schema([
- pa.field("id", pa.int32(), nullable=True),
- pa.field("data", pa.string(), nullable=True),
- ])
- pq.write_table(
- pa.table(
- {
- "id": pa.array([1, 2, 3], type=pa.int32()),
- "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
- },
- schema=arrow_schema,
- ),
- os.path.join(data_local, "00000-0-data.parquet"),
- )
- data_size = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
- data_container_path = f"{container_base}/data/00000-0-data.parquet"
+def _query_local(instance, container_base, optimization=1):
+ return instance.query(
+ f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+ " ORDER BY id"
+ f" SETTINGS allow_experimental_iceberg_read_optimization={optimization}"
+ ).strip()
- snapshot_id = 1
- seq_number = 1
- ts_ms = int(time.time() * 1000)
- iceberg_schema = {
- "type": "struct",
- "schema-id": 0,
- "fields": [
- {"id": 1, "name": "id", "required": False, "type": "int"},
- {"id": 2, "name": "data", "required": False, "type": "string"},
- ],
- }
+def test_stats_less_manifest_returns_real_data(started_cluster_iceberg_no_spark):
+ """
+ Discriminating reproducer for Altinity#1545.
- data_file_record = {
- "content": 0,
- "file_path": data_container_path,
- "file_format": "PARQUET",
- "partition": {},
- "record_count": 3,
- "file_size_in_bytes": data_size,
- }
- if manifest_extra_fields:
- data_file_record.update(manifest_extra_fields)
+ A stats-less manifest (empty columns_info) must read real data, not all-NULL
+ rows. Unpatched build: every nullable column is injected as a constant NULL
+ and the result is 3 all-NULL rows. Fixed build: the empty-stats guard falls
+ through to the Parquet reader and returns the real values.
+ """
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_stats_less_real_" + get_uuid_str()
+ container_base = _container_base(table_name)
- manifest_local_path = os.path.join(meta_local, "00000-0-manifest.avro")
- manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
- _write_avro(
- avro.schema.parse(manifest_schema_str),
- [{
- "status": 1,
- "snapshot_id": snapshot_id,
- "sequence_number": seq_number,
- "file_sequence_number": seq_number,
- "data_file": data_file_record,
- }],
- manifest_local_path,
- metadata={
- "schema": json.dumps(iceberg_schema),
- "partition-spec": "[]",
- },
- )
- manifest_size = os.path.getsize(manifest_local_path)
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_dir = _create_iceberg_table(
+ tmpdir, table_name, container_base,
+ manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+ )
+ _upload_to_node(instance, local_dir, container_base)
- mlist_filename = f"snap-{snapshot_id}-0-manifest-list.avro"
- mlist_local_path = os.path.join(meta_local, mlist_filename)
- mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
- _write_avro(
- avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
- [{
- "manifest_path": manifest_container_path,
- "manifest_length": manifest_size,
- "partition_spec_id": 0,
- "content": 0,
- "sequence_number": seq_number,
- "min_sequence_number": seq_number,
- "added_snapshot_id": snapshot_id,
- "added_files_count": 1,
- "existing_files_count": 0,
- "deleted_files_count": 0,
- "added_rows_count": 3,
- "existing_rows_count": 0,
- "deleted_rows_count": 0,
- }],
- mlist_local_path,
+ result = _query_local(instance, container_base)
+ assert result == _REAL_DATA_RESULT, (
+ f"Got: {result!r}\n"
+ "All-NULL rows mean the bug fired: empty columns_info caused the "
+ "optimization to treat every nullable column as absent and inject "
+ "constant NULL (need_only_count=1)."
)
- metadata = {
- "format-version": 2,
- "table-uuid": str(uuid.uuid4()),
- "location": container_base,
- "last-sequence-number": seq_number,
- "last-updated-ms": ts_ms,
- "last-column-id": 2,
- "current-schema-id": 0,
- "schemas": [{
- "type": "struct",
- "schema-id": 0,
- "fields": [
- {"id": 1, "name": "id", "required": False, "type": "int"},
- {"id": 2, "name": "data", "required": False, "type": "string"},
- ],
- }],
- "default-spec-id": 0,
- "partition-specs": [{"spec-id": 0, "fields": []}],
- "last-partition-id": 999,
- "default-sort-order-id": 0,
- "sort-orders": [{"order-id": 0, "fields": []}],
- "properties": {},
- "current-snapshot-id": snapshot_id,
- "snapshots": [{
- "snapshot-id": snapshot_id,
- "sequence-number": seq_number,
- "timestamp-ms": ts_ms,
- "manifest-list": mlist_container_path,
- "summary": {"operation": "append"},
- "schema-id": 0,
- }],
- "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
- "metadata-log": [],
- "refs": {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
- }
- with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
- json.dump(metadata, f, indent=2)
-
- return table_local
-
-def test_iceberg_local_returns_actual_rows_with_stats_less_manifest(
- started_cluster_iceberg_no_spark,
-):
+def test_stats_less_manifest_schema_evolution_absent_column(started_cluster_iceberg_no_spark):
"""
- FAILS on unpatched code (Altinity#1545).
+ Discriminating test for the schema-evolution fall-through (highest risk).
+
+ The Iceberg schema declares ``id``+``data`` but the Parquet file was written
+ under the older schema and physically contains only ``id``. With the
+ stats-less manifest, the fixed build must fall through to the Parquet reader,
+ read ``id`` for real, and synthesize the absent ``data`` column as NULL via
+ IcebergMetadata::getInitialSchemaByPath (file schema as initial_header).
- With allow_experimental_iceberg_read_optimization=1 (the default), reading an
- icebergLocal table whose manifest omits column statistics returns all-NULL rows
- instead of real data. The correct output is 3 rows: (1,'hello'), (2,'world'),
- (3,'iceberg').
+ Unpatched build: the optimization marks BOTH columns (including ``id``)
+ constant NULL, so ``id`` comes back NULL too -- the discriminator.
"""
- instance = started_cluster_iceberg_no_spark.instances["node1"]
- table_name = "test_opt_on_stats_less_" + get_uuid_str()
- container_base = (
- f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
- )
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_stats_less_evolve_" + get_uuid_str()
+ container_base = _container_base(table_name)
with tempfile.TemporaryDirectory() as tmpdir:
- local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
- _upload_to_container(instance, local_dir, container_base)
-
- result = instance.query(
- f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
- " ORDER BY id"
- " SETTINGS allow_experimental_iceberg_read_optimization=1"
- ).strip()
+ local_dir = _create_iceberg_table(
+ tmpdir, table_name, container_base,
+ manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+ file_has_data_column=False,
+ schema_evolution=True,
+ )
+ _upload_to_node(instance, local_dir, container_base)
- assert result == "1\thello\n2\tworld\n3\ticeberg", (
+ result = _query_local(instance, container_base)
+ assert result == _DATA_NULL_RESULT, (
f"Got: {result!r}\n"
- "All-NULL rows means the bug fired: empty columns_infos caused the "
- "optimization to treat every nullable column as an absent schema-evolution "
- "column and inject constant NULL (need_only_count=1)."
+ "Expected real 'id' with NULL 'data'. If 'id' is also NULL the bug "
+ "fired; if the query errors, the Parquet fall-through did not synthesize "
+ "the schema-evolved-absent column as NULL."
)
-def test_iceberg_local_returns_correct_rows_when_optimization_disabled(
- started_cluster_iceberg_no_spark,
-):
+def test_stats_less_manifest_cluster_returns_real_data(started_cluster_iceberg_no_spark):
"""
- PASSES on unpatched code.
-
- With allow_experimental_iceberg_read_optimization=0 the optimization block is
- skipped entirely and the Parquet file is read normally. This test is a
- regression guard: it must keep passing after the bug is fixed.
+ Cluster/serialization-path coverage for Altinity#1545.
+
+ icebergLocalCluster distributes file-level tasks to workers, serializing
+ DataFileMetaInfo (and its empty columns_info) through the cluster JSON path.
+ An empty columns_info must survive the round-trip and still read real data on
+ the worker, exercising the same fix on the *Cluster variants. The table is
+ uploaded to every node because local storage is read by each worker from its
+ own filesystem.
"""
- instance = started_cluster_iceberg_no_spark.instances["node1"]
- table_name = "test_opt_off_stats_less_" + get_uuid_str()
- container_base = (
- f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
- )
+ cluster = started_cluster_iceberg_no_spark
+ instance = cluster.instances["node1"]
+ table_name = "test_stats_less_cluster_" + get_uuid_str()
+ container_base = _container_base(table_name)
with tempfile.TemporaryDirectory() as tmpdir:
- local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
- _upload_to_container(instance, local_dir, container_base)
+ local_dir = _create_iceberg_table(
+ tmpdir, table_name, container_base,
+ manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+ )
+ for node in cluster.instances.values():
+ _upload_to_node(node, local_dir, container_base)
result = instance.query(
- f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+ f"SELECT * FROM icebergLocalCluster('cluster_simple', local, path='{container_base}', format=Parquet)"
" ORDER BY id"
- " SETTINGS allow_experimental_iceberg_read_optimization=0"
+ " SETTINGS allow_experimental_iceberg_read_optimization=1"
).strip()
-
- assert result == "1\thello\n2\tworld\n3\ticeberg"
+ assert result == _REAL_DATA_RESULT, (
+ f"Got: {result!r}\n"
+ "All-NULL rows mean empty columns_info was mishandled on the worker after "
+ "the cluster JSON round-trip."
+ )
-def test_iceberg_local_partial_stats_manifest_reads_correctly(
- started_cluster_iceberg_no_spark,
-):
+def test_non_empty_stats_absent_column_still_null(started_cluster_iceberg_no_spark):
"""
- Regression test for Altinity#1545: partial stats (value_counts only).
+ Non-regression guard: the optimization's legitimate absent-NULL path.
- A manifest that includes value_counts but omits column_sizes and
- null_value_counts still populates columns_info. The fix must treat the
- columns as real (not absent), so real data is returned.
+ The Iceberg schema declares ``id``+``data`` but the Parquet file contains
+ only ``id``, and the manifest carries stats for ``id`` only (non-empty
+ columns_info). The optimization must keep marking the genuinely-absent
+ ``data`` column NULL while ``id`` reads real values. Passes on both builds;
+ the fix only changes the empty-columns_info case.
"""
- instance = started_cluster_iceberg_no_spark.instances["node1"]
- table_name = "test_opt_on_partial_stats_" + get_uuid_str()
- container_base = (
- f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
- )
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_nonempty_absent_" + get_uuid_str()
+ container_base = _container_base(table_name)
- # value_counts for field_id 1 (id) and 2 (data): 3 rows each.
- extra = {
- "value_counts": [
- {"key": 1, "value": 3},
- {"key": 2, "value": 3},
- ],
- }
+ # value_counts for field_id 1 (id) only -> columns_info = {id}.
+ stats = {"value_counts": [{"key": 1, "value": len(_IDS)}]}
with tempfile.TemporaryDirectory() as tmpdir:
- local_dir = _create_iceberg_table_with_schema(
+ local_dir = _create_iceberg_table(
tmpdir, table_name, container_base,
- _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
- manifest_extra_fields=extra,
+ manifest_entry_schema_str=_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
+ file_has_data_column=False,
+ data_file_stats=stats,
)
- _upload_to_container(instance, local_dir, container_base)
+ _upload_to_node(instance, local_dir, container_base)
- result = instance.query(
- f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
- " ORDER BY id"
- " SETTINGS allow_experimental_iceberg_read_optimization=1"
- ).strip()
-
- assert result == "1\thello\n2\tworld\n3\ticeberg", (
+ result = _query_local(instance, container_base)
+ assert result == _DATA_NULL_RESULT, (
f"Got: {result!r}\n"
- "Partial-stats manifest (value_counts only) should not trigger the absent-NULL "
- "path because columns_info is non-empty. All-NULL rows means the guard "
- "fired incorrectly for partial-stats manifests."
+ "With non-empty stats the optimization must still read 'id' and inject "
+ "the absent 'data' column as NULL."
)
-def test_iceberg_local_full_stats_manifest_reads_correctly(
- started_cluster_iceberg_no_spark,
+@pytest.mark.parametrize(
+ "manifest_entry_schema_str,stats",
+ [
+ (
+ _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
+ {"value_counts": [{"key": 1, "value": 3}, {"key": 2, "value": 3}]},
+ ),
+ (
+ _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
+ {
+ "column_sizes": [{"key": 1, "value": 64}, {"key": 2, "value": 128}],
+ "value_counts": [{"key": 1, "value": 3}, {"key": 2, "value": 3}],
+ "null_value_counts": [{"key": 1, "value": 0}, {"key": 2, "value": 0}],
+ },
+ ),
+ ],
+ ids=["partial_stats", "full_stats"],
+)
+def test_non_empty_stats_returns_real_data(
+ started_cluster_iceberg_no_spark, manifest_entry_schema_str, stats
):
"""
- Control test: full-stats manifest (Spark-like) must continue to return real data.
-
- A manifest with all three stats fields (column_sizes, value_counts,
- null_value_counts) is the common case written by Spark. This test verifies the
- optimization does not regress for the normal path after the fix.
+ Non-regression guard: manifests that do populate columns_info (partial
+ value_counts-only, and full Spark-style stats) must keep returning real data
+ when the optimization is enabled. These are the cases the empty-stats guard
+ must not disturb.
"""
- instance = started_cluster_iceberg_no_spark.instances["node1"]
- table_name = "test_opt_on_full_stats_" + get_uuid_str()
- container_base = (
- f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
- )
-
- # Full stats for field_id 1 (id) and 2 (data).
- extra = {
- "column_sizes": [
- {"key": 1, "value": 64},
- {"key": 2, "value": 128},
- ],
- "value_counts": [
- {"key": 1, "value": 3},
- {"key": 2, "value": 3},
- ],
- "null_value_counts": [
- {"key": 1, "value": 0},
- {"key": 2, "value": 0},
- ],
- }
+ instance = started_cluster_iceberg_no_spark.instances["node1"]
+ table_name = "test_nonempty_real_" + get_uuid_str()
+ container_base = _container_base(table_name)
with tempfile.TemporaryDirectory() as tmpdir:
- local_dir = _create_iceberg_table_with_schema(
+ local_dir = _create_iceberg_table(
tmpdir, table_name, container_base,
- _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
- manifest_extra_fields=extra,
+ manifest_entry_schema_str=manifest_entry_schema_str,
+ data_file_stats=stats,
)
- _upload_to_container(instance, local_dir, container_base)
-
- result = instance.query(
- f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
- " ORDER BY id"
- " SETTINGS allow_experimental_iceberg_read_optimization=1"
- ).strip()
+ _upload_to_node(instance, local_dir, container_base)
- assert result == "1\thello\n2\tworld\n3\ticeberg", (
+ result = _query_local(instance, container_base)
+ assert result == _REAL_DATA_RESULT, (
f"Got: {result!r}\n"
- "Full-stats manifest should return real data when optimization is enabled."
+ "A manifest with non-empty stats must return real data when the "
+ "optimization is enabled."
)
From 17ef6e71a7bed981355158186229316af81e1361 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 11 May 2026 16:37:40 -0300
Subject: [PATCH 03/33] cast like insert select
---
src/Storages/MergeTree/ExportPartTask.cpp | 29 ++
src/Storages/MergeTree/MergeTreeData.cpp | 23 +-
src/Storages/StorageReplicatedMergeTree.cpp | 23 +-
.../test.py | 263 ++++++++++++++++++
...rge_tree_part_to_object_storage_simple.sql | 9 +-
...rge_tree_part_to_object_storage_simple.sql | 4 +-
6 files changed, 337 insertions(+), 14 deletions(-)
diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp
index 94b25f63ad9f..f2035577bbf6 100644
--- a/src/Storages/MergeTree/ExportPartTask.cpp
+++ b/src/Storages/MergeTree/ExportPartTask.cpp
@@ -99,6 +99,31 @@ namespace
}
}
+ /// Mirrors `InterpreterInsertQuery::addInsertToSelectPipeline`: positional match,
+ /// destination header = `getSampleBlockNonMaterialized()`, all type bridging is done
+ /// by the CAST inside `makeConvertingActions`. No pre-validation, no per-column
+ /// lossy/non-lossy classification — restrictions are exactly what INSERT SELECT enforces.
+ void addExportConvertingActions(
+ QueryPlan & plan_for_part,
+ const IStorage & destination_storage,
+ const ContextPtr & local_context)
+ {
+ const auto destination_header
+ = destination_storage.getInMemoryMetadataPtr()->getSampleBlockNonMaterialized();
+
+ auto dag = ActionsDAG::makeConvertingActions(
+ plan_for_part.getCurrentHeader()->getColumnsWithTypeAndName(),
+ destination_header.getColumnsWithTypeAndName(),
+ ActionsDAG::MatchColumnsMode::Position,
+ local_context);
+
+ auto expression_step = std::make_unique(
+ plan_for_part.getCurrentHeader(),
+ std::move(dag));
+ expression_step->setStepDescription("Convert source columns to destination types for export");
+ plan_for_part.addStep(std::move(expression_step));
+ }
+
String buildDestinationFilename(
const MergeTreePartExportManifest & manifest,
const StorageID & storage_id,
@@ -261,6 +286,10 @@ bool ExportPartTask::executeStep()
/// This is a hack that materializes the columns before the export so they can be exported to tables that have matching columns
materializeSpecialColumns(plan_for_part.getCurrentHeader(), metadata_snapshot, local_context, plan_for_part);
+ /// Align the pipeline header with the destination's non-materialized sample block,
+ /// using the same `makeConvertingActions(Position)` call INSERT SELECT performs.
+ addExportConvertingActions(plan_for_part, *destination_storage, local_context);
+
QueryPlanOptimizationSettings optimization_settings(local_context);
auto pipeline_settings = BuildQueryPipelineSettings(local_context);
auto builder = plan_for_part.buildQueryPipeline(optimization_settings, pipeline_settings);
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index a07bd992066e..9dfb4edf20bc 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -30,6 +30,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -6750,14 +6751,24 @@ void MergeTreeData::exportPartToTable(
#endif
}
- const auto & source_columns = source_metadata_ptr->getColumns();
+ /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
+ /// positional matching with CAST. We surface structural mismatches synchronously at
+ /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
+ /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
+ /// untyped cast) fails here; anything it accepts will be CAST at runtime.
+ {
+ Block source_sample_block;
+ for (const auto & column : source_metadata_ptr->getColumns().getReadable())
+ source_sample_block.insert({column.type->createColumn(), column.type, column.name});
- const auto & destination_columns = destination_metadata_ptr->getColumns();
+ const auto destination_sample_block = destination_metadata_ptr->getSampleBlockNonMaterialized();
- /// compare all source readable columns with all destination insertable columns
- /// this allows us to skip ephemeral columns
- if (source_columns.getReadable().sizeOfDifference(destination_columns.getInsertable()))
- throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure");
+ (void) ActionsDAG::makeConvertingActions(
+ source_sample_block.getColumnsWithTypeAndName(),
+ destination_sample_block.getColumnsWithTypeAndName(),
+ ActionsDAG::MatchColumnsMode::Position,
+ query_context);
+ }
/// for data lakes this check is performed differently. It is a bit more complex as we need to convert the iceberg partition spec
/// to the MergeTree partition spec and compare the two.
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 7aaecac34832..9104884a2a91 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -79,6 +79,7 @@
#include
#include
#include
+#include
#include
#include
@@ -8387,10 +8388,24 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
auto src_snapshot = getInMemoryMetadataPtr();
auto destination_snapshot = dest_storage->getInMemoryMetadataPtr();
- /// compare all source readable columns with all destination insertable columns
- /// this allows us to skip ephemeral columns
- if (src_snapshot->getColumns().getReadable().sizeOfDifference(destination_snapshot->getColumns().getInsertable()))
- throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure");
+ /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
+ /// positional matching with CAST. We surface structural mismatches synchronously at
+ /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
+ /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
+ /// untyped cast) fails here; anything it accepts will be CAST at runtime.
+ {
+ Block source_sample_block;
+ for (const auto & column : src_snapshot->getColumns().getReadable())
+ source_sample_block.insert({column.type->createColumn(), column.type, column.name});
+
+ const auto destination_sample_block = destination_snapshot->getSampleBlockNonMaterialized();
+
+ (void) ActionsDAG::makeConvertingActions(
+ source_sample_block.getColumnsWithTypeAndName(),
+ destination_sample_block.getColumnsWithTypeAndName(),
+ ActionsDAG::MatchColumnsMode::Position,
+ query_context);
+ }
/// for data lakes this check is performed later. It is a bit more complex as we need to convert the iceberg partition spec
/// to the MergeTree partition spec and compare the two.
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index b371727a08cd..9dc3b2f8434e 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -121,6 +121,42 @@ def assert_part_log(node, table: str, part: str) -> None:
)
+def wait_for_failed_export_part(
+ node,
+ table: str,
+ part: str,
+ timeout: int = 60,
+ poll_interval: float = 0.5,
+) -> str:
+ """Poll system.part_log until a failed ExportPart event appears for *part*.
+
+ Returns the exception text recorded on the log entry, which lets callers
+ assert on the specific runtime error that propagated from the export worker.
+ """
+ deadline = time.time() + timeout
+ last_seen = ""
+ while time.time() < deadline:
+ node.query("SYSTEM FLUSH LOGS")
+ row = node.query(
+ f"SELECT error, exception FROM system.part_log "
+ f"WHERE event_type = 'ExportPart' "
+ f"AND database = currentDatabase() "
+ f"AND table = '{table}' "
+ f"AND part_name = '{part}' "
+ f"AND error != 0 "
+ f"ORDER BY event_time DESC LIMIT 1"
+ ).strip()
+ if row:
+ _error, exception = row.split("\t", 1)
+ return exception
+ last_seen = row
+ time.sleep(poll_interval)
+ raise TimeoutError(
+ f"Failed ExportPart event for part {part!r} in table {table!r} "
+ f"did not appear in system.part_log within {timeout}s (last row: {last_seen!r})"
+ )
+
+
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
@@ -479,3 +515,230 @@ def test_export_part_writes_column_statistics(cluster):
node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+# ---------------------------------------------------------------------------
+# Schema compatibility (INSERT SELECT mirror)
+#
+# EXPORT PART builds an ActionsDAG via makeConvertingActions(Position) — the
+# same primitive that powers INSERT INTO dest SELECT * FROM src. The tests
+# below pin that contract:
+#
+# * Column counts must match positionally; otherwise the ALTER is rejected
+# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands.
+# * Destination column names need not match source names — positional pairs
+# are CAST element-wise.
+# * Castable type differences are accepted at validation time (widening and
+# even lossy narrowing). Lossiness only matters at runtime: if the value
+# does not fit the destination type, the async export worker fails and the
+# failure surfaces via system.part_log; the Iceberg table is untouched.
+# ---------------------------------------------------------------------------
+
+
+def test_export_part_column_count_mismatch_source_more_is_rejected(cluster):
+ """
+ Source has 3 columns (id, year, extra), destination has 2 (id, year).
+ The ALTER must be rejected synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH
+ and the Iceberg table must remain empty.
+ """
+ node = cluster.instances["node1"]
+ sfx = unique_suffix()
+ mt = f"mt_count_more_{sfx}"
+ iceberg = f"iceberg_count_more_{sfx}"
+
+ make_mt(node, mt, "id Int32, year Int32, extra String", "year")
+ make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year")
+
+ node.query(f"INSERT INTO {mt} VALUES (1, 2020, 'foo'), (2, 2020, 'bar')")
+ part_2020 = get_part(node, mt, "2020")
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
+ f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
+ f"allow_experimental_insert_into_iceberg = 1"
+ )
+ assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+ f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source>dest column count, "
+ f"got: {error!r}"
+ )
+
+ node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+ node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+def test_export_part_column_count_mismatch_source_fewer_is_rejected(cluster):
+ """
+ Source has 2 columns (id, year), destination has 3 (id, year, extra).
+ Same expected synchronous rejection as the source>dest case.
+ """
+ node = cluster.instances["node1"]
+ sfx = unique_suffix()
+ mt = f"mt_count_fewer_{sfx}"
+ iceberg = f"iceberg_count_fewer_{sfx}"
+
+ make_mt(node, mt, "id Int32, year Int32", "year")
+ make_iceberg_s3(node, iceberg, "id Int32, year Int32, extra String", "year")
+
+ node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)")
+ part_2020 = get_part(node, mt, "2020")
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
+ f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
+ f"allow_experimental_insert_into_iceberg = 1"
+ )
+ assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+ f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source Int32 is a
+ defined conversion) so the ALTER returns synchronously; the actual cast
+ runs in the async export worker and must fail there. The failure is
+ propagated to system.part_log (non-zero `error` column with a non-empty
+ `exception` text) and the Iceberg table is left empty.
+
+ (Integer narrowing overflow is not used here because the internal cast
+ `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
+ on overflow rather than throwing — that would still produce a successful
+ export with garbage data, not a propagated failure.)
+ """
+ node = cluster.instances["node1"]
+ sfx = unique_suffix()
+ mt = f"mt_runtime_cast_fail_{sfx}"
+ iceberg = f"iceberg_runtime_cast_fail_{sfx}"
+
+ make_mt(node, mt, "id String, year Int32", "year")
+ make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year")
+
+ node.query(f"INSERT INTO {mt} VALUES ('not a number', 2020)")
+ part_2020 = get_part(node, mt, "2020")
+
+ export_part(node, mt, part_2020, iceberg)
+
+ exception = wait_for_failed_export_part(node, mt, part_2020)
+ assert exception, (
+ f"Expected non-empty exception text on failed ExportPart entry, got {exception!r}"
+ )
+
+ count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+ assert count == 0, (
+ f"Expected 0 rows in Iceberg table after failed export, got {count}"
+ )
+
+ node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+ node.query(f"DROP TABLE IF EXISTS {iceberg}")
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index 7cb70af024a2..ceb83191f93a 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -9,10 +9,12 @@ CREATE TABLE 03572_mt_table (id UInt64, year UInt16) ENGINE = MergeTree() PARTIT
INSERT INTO 03572_mt_table VALUES (1, 2020);
-- Create a table with a different partition key and export a partition to it. It should throw
+-- on the partition-key AST mismatch (schema compat now follows INSERT SELECT positional semantics,
+-- so the column shape matches and the partition-key check is what fires).
CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x;
ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_invalid_schema_table
-SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS}
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
DROP TABLE 03572_invalid_schema_table;
@@ -27,13 +29,14 @@ ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE FUNCTION extractKey
-- It is a table function, but the engine does not support exports/imports, should throw
ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE FUNCTION url('a.parquet'); -- {serverError NOT_IMPLEMENTED}
--- Test that destination table can not have a column that matches the source ephemeral
+-- Source-side ephemeral columns are not readable, so the destination must not declare a matching
+-- ordinary column or the column count will not align under positional matching.
CREATE TABLE 03572_ephemeral_mt_table (id UInt64, year UInt16, name String EPHEMERAL) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_matching_ephemeral_s3_table (id UInt64, year UInt16, name String) ENGINE = S3(s3_conn, filename='03572_matching_ephemeral_s3_table', format='Parquet', partition_strategy='hive') PARTITION BY year;
INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice');
-ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError INCOMPATIBLE_COLUMNS}
+ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH}
DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index f8f23532f0a7..0f6647861e43 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -7,10 +7,12 @@ CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTr
INSERT INTO 03572_rmt_table VALUES (1, 2020);
-- Create a table with a different partition key and export a partition to it. It should throw
+-- on the partition-key AST mismatch (schema compat now follows INSERT SELECT positional semantics,
+-- so the column shape matches and the partition-key check is what fires).
CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x;
ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table
-SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS}
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
DROP TABLE 03572_invalid_schema_table;
From c88f69eb66f5cb5e794995d40e140c903d94da0a Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 22 May 2026 18:51:08 -0300
Subject: [PATCH 04/33] tmp
---
.../MergeTree/ExportPartitionUtils.cpp | 60 +++++++++-
src/Storages/MergeTree/ExportPartitionUtils.h | 35 +++++-
src/Storages/MergeTree/MergeTreeData.cpp | 22 +++-
src/Storages/StorageReplicatedMergeTree.cpp | 22 +++-
.../test.py | 67 +++++++++++
.../test.py | 66 ++++++++++
.../test.py | 113 ++++++++++++++++++
...rge_tree_part_to_object_storage_simple.sql | 16 ++-
...rge_tree_part_to_object_storage_simple.sql | 14 ++-
9 files changed, 400 insertions(+), 15 deletions(-)
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index 4a447c8de3ab..d5df9b6bea2a 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -403,8 +403,18 @@ namespace ExportPartitionUtils
#if USE_AVRO
void verifyIcebergPartitionCompatibility(
const Poco::JSON::Object::Ptr & metadata_object,
- const ASTPtr & partition_key_ast)
+ const ASTPtr & partition_key_ast,
+ const StorageMetadataPtr & source_metadata,
+ const StorageMetadataPtr & destination_metadata,
+ const StorageID & destination_storage_id)
{
+ /// The Iceberg manifest's partition values are derived from the SOURCE
+ /// part's minmax block (see `IcebergMetadata::commitExportPartitionTransaction`)
+ /// while the row data is CAST to the destination Iceberg schema by
+ /// `addExportConvertingActions`. Reject castable type differences on
+ /// partition-key columns up front so the two cannot disagree.
+ verifyPartitionColumnsExactTypeMatch(source_metadata, destination_metadata, destination_storage_id);
+
const auto original_schema_id = metadata_object->getValue(Iceberg::f_current_schema_id);
const auto partition_spec_id = metadata_object->getValue(Iceberg::f_default_spec_id);
@@ -519,6 +529,54 @@ namespace ExportPartitionUtils
}
}
#endif
+
+ void verifyPartitionColumnsExactTypeMatch(
+ const StorageMetadataPtr & source_metadata,
+ const StorageMetadataPtr & destination_metadata,
+ const StorageID & destination_storage_id)
+ {
+ if (!source_metadata->hasPartitionKey())
+ return;
+
+ /// Use the partition expression's required columns (the ones that flow
+ /// into the destination's partition strategy via
+ /// `minmax_idx->getBlock(storage)`), not the partition expression's
+ /// sample-block output columns. This catches both identity
+ /// partitioning (`PARTITION BY year`) and computed partitioning
+ /// (`PARTITION BY toYearNumSinceEpoch(event_date)` — required column
+ /// is `event_date`).
+ const auto required_columns
+ = source_metadata->getPartitionKey().expression->getRequiredColumnsWithTypes();
+
+ const auto & destination_columns = destination_metadata->getColumns();
+
+ for (const auto & source_column : required_columns)
+ {
+ if (!destination_columns.has(source_column.name))
+ throw Exception(ErrorCodes::BAD_ARGUMENTS,
+ "Cannot export partition: source partition key references column '{}' "
+ "which does not exist on the destination table {}. "
+ "EXPORT does not re-evaluate the partition key against the destination's schema, "
+ "so every column the source partition expression depends on must be present "
+ "on the destination with the same type.",
+ source_column.name, destination_storage_id.getFullTableName());
+
+ const auto destination_type
+ = destination_columns.getPhysical(source_column.name).type;
+ if (!source_column.type->equals(*destination_type))
+ throw Exception(ErrorCodes::BAD_ARGUMENTS,
+ "Cannot export partition: partition key column '{}' has type {} on the source "
+ "but {} on the destination table {}. "
+ "EXPORT does not re-evaluate the partition key against the destination's schema, "
+ "so castable type differences are not allowed for partition columns "
+ "(the partition path and any Iceberg partition values are derived from the "
+ "source-typed minmax block, while the row data is CAST to the destination type).",
+ source_column.name,
+ source_column.type->getName(),
+ destination_type->getName(),
+ destination_storage_id.getFullTableName());
+ }
+ }
}
}
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h
index eb67d288d71e..958a8a819457 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.h
+++ b/src/Storages/MergeTree/ExportPartitionUtils.h
@@ -7,6 +7,7 @@
#include
#include
#include "Storages/IStorage.h"
+#include
#include
#if USE_AVRO
@@ -88,13 +89,41 @@ namespace ExportPartitionUtils
const std::string & exception_message,
const LoggerPtr & log);
+ /// EXPORT PART/PARTITION never re-evaluates the partition key against the
+ /// destination's schema: the partition path (object storage) and the
+ /// Iceberg partition values (data lake) are derived from the SOURCE part's
+ /// minmax block in ExportPartTask::executeStep. Therefore every column the
+ /// SOURCE partition expression depends on must have an identically-typed
+ /// counterpart on the destination — castable but non-equal types are not
+ /// safe here even though `makeConvertingActions` would accept them.
+ ///
+ /// Called directly from the non-data-lake branch of
+ /// `MergeTreeData::exportPartToTable` /
+ /// `StorageReplicatedMergeTree::exportPartitionToTable`, and indirectly
+ /// for the Iceberg branch via `verifyIcebergPartitionCompatibility` which
+ /// embeds this check. Throws BAD_ARGUMENTS on missing column or type
+ /// mismatch.
+ void verifyPartitionColumnsExactTypeMatch(
+ const StorageMetadataPtr & source_metadata,
+ const StorageMetadataPtr & destination_metadata,
+ const StorageID & destination_storage_id);
+
#if USE_AVRO
/// Verifies that the source MergeTree partition key is compatible with the
- /// destination Iceberg partition spec by comparing field source-ids and
- /// transforms in order. Throws BAD_ARGUMENTS if they do not match.
+ /// destination Iceberg partition spec by:
+ /// * Comparing partition-field source-ids and transforms in order
+ /// against the Iceberg spec derived from the MergeTree PARTITION BY AST.
+ /// * Embedding `verifyPartitionColumnsExactTypeMatch` so that columns
+ /// flowing through the partition key have identical ClickHouse types on
+ /// both sides — the Iceberg manifest's partition values come from the
+ /// SOURCE-typed minmax block and cannot be reconciled later.
+ /// Throws BAD_ARGUMENTS on any mismatch.
void verifyIcebergPartitionCompatibility(
const Poco::JSON::Object::Ptr & metadata_object,
- const ASTPtr & partition_key_ast);
+ const ASTPtr & partition_key_ast,
+ const StorageMetadataPtr & source_metadata,
+ const StorageMetadataPtr & destination_metadata,
+ const StorageID & destination_storage_id);
#endif
}
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 9dfb4edf20bc..ee2b7d2250c8 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6743,7 +6743,12 @@ void MergeTreeData::exportPartToTable(
metadata_object->stringify(oss);
iceberg_metadata_json = oss.str();
- ExportPartitionUtils::verifyIcebergPartitionCompatibility(metadata_object, source_metadata_ptr->getPartitionKeyAST());
+ ExportPartitionUtils::verifyIcebergPartitionCompatibility(
+ metadata_object,
+ source_metadata_ptr->getPartitionKeyAST(),
+ source_metadata_ptr,
+ destination_metadata_ptr,
+ dest_storage->getStorageID());
}
#else
(void)iceberg_metadata_json_;
@@ -6770,12 +6775,23 @@ void MergeTreeData::exportPartToTable(
query_context);
}
- /// for data lakes this check is performed differently. It is a bit more complex as we need to convert the iceberg partition spec
- /// to the MergeTree partition spec and compare the two.
+ /// For data lakes (Iceberg) the partition-key checks — both spec/transform
+ /// compatibility and ClickHouse partition-column exact-type equality — are
+ /// performed inside `verifyIcebergPartitionCompatibility` above.
+ /// For non-data-lake destinations we do the AST equivalence check here and
+ /// then run the partition-column exact-type check directly.
if (!dest_storage->isDataLake())
{
if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST()))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
+
+ /// EXPORT does not re-evaluate the partition key against the destination's schema:
+ /// the hive partition path is derived from the SOURCE part's minmax block in
+ /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
+ /// by `addExportConvertingActions`. Reject castable type differences on
+ /// partition-key columns synchronously so the path and the row data cannot disagree.
+ ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
+ source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID());
}
auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated});
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 9104884a2a91..cb576f137546 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -8407,14 +8407,24 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
query_context);
}
- /// for data lakes this check is performed later. It is a bit more complex as we need to convert the iceberg partition spec
- /// to the MergeTree partition spec and compare the two.
+ /// For data lakes (Iceberg) the partition-key checks — both spec/transform
+ /// compatibility and ClickHouse partition-column exact-type equality — are
+ /// performed inside `verifyIcebergPartitionCompatibility` below (per partition).
+ /// For non-data-lake destinations we do the AST equivalence check here and
+ /// then run the partition-column exact-type check directly.
if (!dest_storage->isDataLake())
{
if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST()))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
+
+ /// EXPORT does not re-evaluate the partition key against the destination's schema:
+ /// the hive partition path is derived from the SOURCE part's minmax block in
+ /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
+ /// by `addExportConvertingActions`. Reject castable type differences on
+ /// partition-key columns synchronously so the path and the row data cannot disagree.
+ ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
+ src_snapshot, destination_snapshot, dest_storage->getStorageID());
}
-
zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly();
@@ -8586,7 +8596,11 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
const auto metadata_object = iceberg_metadata->getMetadataJSON(query_context);
ExportPartitionUtils::verifyIcebergPartitionCompatibility(
- metadata_object, src_snapshot->getPartitionKeyAST());
+ metadata_object,
+ src_snapshot->getPartitionKeyAST(),
+ src_snapshot,
+ destination_snapshot,
+ dest_storage->getStorageID());
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index 9dc3b2f8434e..43ba24116526 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -385,6 +385,73 @@ def test_export_part_with_year_transform_partition(cluster):
node.query(f"DROP TABLE IF EXISTS {iceberg}")
+def test_export_part_partition_column_type_mismatch_is_rejected(cluster):
+ """
+ EXPORT PART must synchronously reject (BAD_ARGUMENTS) when a partition
+ column has different (even castable) types on source and destination.
+
+ The partition path / Iceberg partition values are computed from the
+ SOURCE part's minmax block in ExportPartTask::executeStep before
+ addExportConvertingActions casts the data, so a castable mismatch on
+ a partition column would let the path/value computation and the row
+ data disagree. The check in
+ ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch rejects this
+ at ALTER time.
+
+ Failing case: source `year Int32` -> destination `year Int64`, both
+ PARTITION BY year (identity transform). Both Int32 and Int64 are valid
+ Iceberg primitives, so verifyIcebergPartitionCompatibility happily
+ accepts the spec — only the exact-type check rejects.
+ """
+ node = cluster.instances["node1"]
+ sfx = unique_suffix()
+ mt = f"mt_pcol_type_mismatch_{sfx}"
+ iceberg = f"iceberg_pcol_type_mismatch_{sfx}"
+
+ make_mt(node, mt, "id Int32, year Int32", "year")
+ make_iceberg_s3(node, iceberg, "id Int32, year Int64", "year")
+
+ node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)")
+
+ part_2020 = get_part(node, mt, "2020")
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
+ f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
+ f"allow_experimental_insert_into_iceberg = 1"
+ )
+ assert "BAD_ARGUMENTS" in error, (
+ f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+ f"got: {error!r}"
+ )
+ assert "partition key column 'year'" in error, (
+ f"Expected the error message to identify the offending partition "
+ f"column 'year', got: {error!r}"
+ )
+
+ # No async work scheduled: no ExportPart row in system.part_log.
+ node.query("SYSTEM FLUSH LOGS")
+ part_log_count = node.query(
+ f"SELECT count() FROM system.part_log "
+ f"WHERE event_type = 'ExportPart' "
+ f" AND database = currentDatabase() "
+ f" AND table = '{mt}' "
+ f" AND part_name = '{part_2020}'"
+ ).strip()
+ assert part_log_count == "0", (
+ f"Expected no ExportPart entries in system.part_log after a "
+ f"synchronously-rejected export, found {part_log_count}."
+ )
+
+ count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+ assert count == 0, (
+ f"Expected 0 rows in Iceberg table after rejected export, got {count}"
+ )
+
+ node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+ node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
def test_export_part_partition_key_mismatch_is_rejected(cluster):
"""
EXPORT PART must synchronously reject (BAD_ARGUMENTS) when the source
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 8b49589a3005..e6e5127bdd72 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,6 +746,72 @@ def test_partition_key_compatibility_check(cluster):
)
+def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
+ """
+ EXPORT PARTITION must synchronously reject (BAD_ARGUMENTS) when a
+ partition column has a different (even castable) type on the source
+ MergeTree and the destination Iceberg table.
+
+ The Iceberg destination's partition path is derived from the SOURCE
+ part's minmax block in ExportPartTask::executeStep, and the Iceberg
+ manifest's partition values come from the SOURCE-typed
+ `manifest.data_part->partition.value` Fields. Letting a castable type
+ mismatch through would mean the manifest/path computation and the row
+ data (which IS cast by addExportConvertingActions) disagree — not
+ equivalent to INSERT SELECT.
+
+ Failing case: source `year Int32` -> destination `year Int64`, both
+ PARTITION BY year (identity transform). Both are valid Iceberg primitives
+ so verifyIcebergPartitionCompatibility accepts the spec; only the new
+ exact-type check rejects.
+ """
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_pcol_type_mismatch_{uid}"
+ iceberg_table = f"iceberg_pcol_type_mismatch_{uid}"
+
+ # Source uses year Int32; destination Iceberg uses year Int64.
+ make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)")
+
+ make_iceberg_s3(
+ node, iceberg_table, "id Int64, year Int64",
+ partition_by="year", s3_retry_attempts=3,
+ )
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}"
+ )
+ assert "BAD_ARGUMENTS" in error, (
+ f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+ f"got: {error!r}"
+ )
+ assert "partition key column 'year'" in error, (
+ f"Expected the error message to identify the offending partition "
+ f"column 'year', got: {error!r}"
+ )
+
+ # No async work scheduled.
+ rows_in_system_view = node.query(
+ f"SELECT count() FROM system.replicated_partition_exports "
+ f"WHERE source_table = '{mt_table}' "
+ f" AND destination_table = '{iceberg_table}' "
+ f" AND partition_id = '2020'"
+ ).strip()
+ assert rows_in_system_view == "0", (
+ f"Expected no row in system.replicated_partition_exports after a "
+ f"synchronously-rejected export, got {rows_in_system_view}."
+ )
+
+ # Destination must remain empty.
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 0, (
+ f"Expected 0 rows in Iceberg destination after rejected export, "
+ f"got {count}"
+ )
+
+
def test_export_ttl(cluster):
"""
After a manifest TTL expires the same partition can be re-exported, and the
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index 3161e3b67100..5c3e656107bc 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -1532,6 +1532,119 @@ def test_export_partition_all(cluster):
assert row_count == 3, f"Expected 3 rows in S3 after EXPORT PARTITION ALL, got {row_count}"
+def test_export_partition_partition_column_castable_type_mismatch(cluster):
+ """
+ Post-fix contract: EXPORT PARTITION must reject castable but non-equal
+ partition-column types synchronously.
+
+ Setup
+ -----
+ * Source (ReplicatedMergeTree): ``id UInt64, year String``,
+ ``PARTITION BY year``
+ * Destination (S3, partition_strategy='hive'): ``id UInt64,
+ year UInt16``, ``PARTITION BY year``
+
+ Why the synchronous rejection is required
+ -----------------------------------------
+ ``makeConvertingActions(Position)`` accepts ``String -> UInt16`` and the
+ PARTITION BY ASTs are textually identical, so the schema/AST validators
+ would let this through. But EXPORT does not re-evaluate the partition
+ key against the destination's schema: in ``ExportPartTask::executeStep``
+ the block fed to ``destination_storage->import`` is built from the
+ source part's minmax index (SOURCE types). If we let the export proceed
+ the hive partition path is computed from the un-cast SOURCE value while
+ the row data is the CAST value — silent divergence from INSERT SELECT.
+ The exact-type partition-column check in
+ ``ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch`` rejects
+ this pairing at ALTER time.
+
+ Assertions
+ ----------
+ * ``ALTER ... EXPORT PARTITION ID '' TO TABLE ...`` raises a
+ ``BAD_ARGUMENTS`` exception whose text references the partition
+ column type mismatch.
+ * No row is created in ``system.replicated_partition_exports`` for
+ this (source_table, destination_table, partition_id) — nothing
+ was ever scheduled.
+ * No parquet file is written under any ``year=*/`` prefix in S3.
+ """
+ skip_if_remote_database_disk_enabled(cluster)
+ node = cluster.instances["replica1"]
+
+ postfix = str(uuid.uuid4()).replace("-", "_")
+ mt_table = f"pkey_cast_mismatch_partition_mt_{postfix}"
+ s3_table = f"pkey_cast_mismatch_partition_s3_{postfix}"
+
+ # Source: year String; destination: year UInt16. PARTITION BY year on
+ # both sides — same AST text — to defeat the AST equivalence check.
+ node.query(
+ f"CREATE TABLE {mt_table} (id UInt64, year String) "
+ f"ENGINE = ReplicatedMergeTree('/clickhouse/tables/{mt_table}', 'replica1') "
+ f"PARTITION BY year "
+ f"ORDER BY tuple()"
+ )
+ node.query(
+ f"CREATE TABLE {s3_table} (id UInt64, year UInt16) "
+ f"ENGINE = S3(s3_conn, filename='{s3_table}', "
+ f"format=Parquet, partition_strategy='hive') "
+ f"PARTITION BY year"
+ )
+
+ node.query(
+ f"INSERT INTO {mt_table} VALUES (1, '2020'), (2, '2020'), (3, '2020')"
+ )
+
+ # With a String partition column the partition_id is the SipHash of the
+ # value rather than the textual representation — look it up so we can
+ # reference the partition explicitly in EXPORT PARTITION ID and in
+ # subsequent system.replicated_partition_exports queries.
+ partition_id = node.query(
+ f"SELECT partition_id FROM system.parts "
+ f"WHERE database = currentDatabase() AND table = '{mt_table}' "
+ f" AND active "
+ f"ORDER BY name LIMIT 1"
+ ).strip()
+ assert partition_id, (
+ "Expected one active part on the source table after INSERT; "
+ "system.parts returned nothing."
+ )
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{partition_id}' "
+ f"TO TABLE {s3_table}"
+ )
+ assert "BAD_ARGUMENTS" in error, (
+ f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+ f"got: {error!r}"
+ )
+ assert "partition key column 'year'" in error, (
+ f"Expected the error message to identify the offending partition "
+ f"column 'year', got: {error!r}"
+ )
+
+ # Nothing scheduled: no row in system.replicated_partition_exports.
+ rows_in_system_view = node.query(
+ f"SELECT count() FROM system.replicated_partition_exports "
+ f"WHERE source_table = '{mt_table}' "
+ f" AND destination_table = '{s3_table}' "
+ f" AND partition_id = '{partition_id}'"
+ ).strip()
+ assert rows_in_system_view == "0", (
+ f"Expected no row in system.replicated_partition_exports after a "
+ f"synchronously-rejected export, got {rows_in_system_view}."
+ )
+
+ # Nothing written: no parquet file under any year=*/ partition prefix.
+ files_in_s3 = node.query(
+ f"SELECT count() FROM s3(s3_conn, "
+ f"filename='{s3_table}/year=*/*.parquet', format='One')"
+ ).strip()
+ assert files_in_s3 == "0", (
+ f"Expected no Parquet files in S3 after a synchronously-rejected "
+ f"export, found {files_in_s3}."
+ )
+
+
def test_export_partition_all_failure_modes(cluster):
"""Cover the three values of `export_merge_tree_partition_all_on_error`.
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index ceb83191f93a..9357f4e5aa78 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
-- Tags: no-parallel, no-fasttest
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table;
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
SET allow_experimental_export_merge_tree_part=1;
@@ -39,4 +39,16 @@ INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice');
ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH}
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table;
+-- Castable but non-equal partition-column types must be rejected synchronously.
+-- The partition path is computed from the source part's minmax block (SOURCE types)
+-- BEFORE row data is CAST to destination types — letting String -> UInt16 through
+-- would cause silent divergence between partition path and row values.
+CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+-- Partition-column type mismatch is checked synchronously BEFORE the part lookup,
+-- so we don't need to insert data nor use a real part name to provoke it.
+ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index 0f6647861e43..874b5f288cf7 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
-- Tags: no-parallel, no-fasttest
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table;
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple();
@@ -21,4 +21,14 @@ CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_
ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED}
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table;
+-- Castable but non-equal partition-column types must be rejected synchronously.
+-- The partition path is computed from the source part's minmax block (SOURCE types)
+-- BEFORE row data is CAST to destination types — letting String -> UInt16 through
+-- would cause silent divergence between partition path and row values.
+CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
From 67489f2e7f84ca9e2a124937c27859429a0aa2e6 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 8 Jun 2026 10:26:33 -0300
Subject: [PATCH 05/33] fix build and tests
---
.../MergeTree/ExportPartitionUtils.cpp | 1 +
.../test.py | 264 +++++++++++++++++-
2 files changed, 264 insertions(+), 1 deletion(-)
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index d5df9b6bea2a..f227d318a1fd 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -10,6 +10,7 @@
#include
#include
#include
+#include
#if USE_AVRO
#include
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index e6e5127bdd72..45aa565b3586 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -781,7 +781,8 @@ def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
)
error = node.query_and_get_error(
- f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}"
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
)
assert "BAD_ARGUMENTS" in error, (
f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
@@ -1050,3 +1051,264 @@ def test_export_partition_writes_column_statistics(cluster):
entries = fetch_manifest_entries(node, query_id)
assert_exported_stats(entries)
+
+
+# ---------------------------------------------------------------------------
+# Schema compatibility (INSERT SELECT mirror)
+#
+# EXPORT PARTITION builds an ActionsDAG via makeConvertingActions(Position) — the
+# same primitive that powers INSERT INTO dest SELECT * FROM src. The destination
+# compatibility validation in StorageReplicatedMergeTree::exportPartitionToTable
+# mirrors what the plain-MergeTree EXPORT PART path does in MergeTreeData, so the
+# tests below pin the same contract for the replicated partition export:
+#
+# * Column counts must match positionally; otherwise the ALTER is rejected
+# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled.
+# * Destination column names need not match source names — positional pairs
+# are CAST element-wise.
+# * Castable type differences are accepted at validation time (widening and
+# even lossy narrowing). Lossiness only matters at runtime: if the value
+# does not fit the destination type, the async export worker fails and the
+# export is marked FAILED in system.replicated_partition_exports; the Iceberg
+# table is left untouched.
+# ---------------------------------------------------------------------------
+
+
+def test_export_partition_column_count_mismatch_source_more_is_rejected(cluster):
+ """
+ Source has 3 columns (id, year, extra), destination has 2 (id, year).
+ The ALTER must be rejected synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH,
+ nothing must be scheduled in system.replicated_partition_exports, and the
+ Iceberg table must remain empty.
+ """
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_count_more_{uid}"
+ iceberg_table = f"iceberg_count_more_{uid}"
+
+ make_rmt(node, mt_table, "id Int64, year Int32, extra String", "year",
+ replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES (1, 2020, 'foo'), (2, 2020, 'bar')")
+
+ make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year")
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+ f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source>dest column count, "
+ f"got: {error!r}"
+ )
+
+ rows_in_system_view = node.query(
+ f"SELECT count() FROM system.replicated_partition_exports "
+ f"WHERE source_table = '{mt_table}' "
+ f" AND destination_table = '{iceberg_table}' "
+ f" AND partition_id = '2020'"
+ ).strip()
+ assert rows_in_system_view == "0", (
+ f"Expected no row in system.replicated_partition_exports after a "
+ f"synchronously-rejected export, got {rows_in_system_view}."
+ )
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 0, (
+ f"Expected 0 rows in Iceberg table after rejected export, got {count}"
+ )
+
+
+def test_export_partition_column_count_mismatch_source_fewer_is_rejected(cluster):
+ """
+ Source has 2 columns (id, year), destination has 3 (id, year, extra).
+ Same expected synchronous rejection as the source>dest case.
+ """
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_count_fewer_{uid}"
+ iceberg_table = f"iceberg_count_fewer_{uid}"
+
+ make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)")
+
+ make_iceberg_s3(node, iceberg_table, "id Int64, year Int32, extra String",
+ partition_by="year")
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+ f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source Int32 is a
+ defined conversion) so the ALTER returns synchronously; the actual cast
+ runs in the async export worker and must fail there. The failure exhausts
+ the retry budget and the export is marked FAILED in
+ system.replicated_partition_exports with a non-zero exception_count, while
+ the Iceberg table is left empty.
+
+ (Integer narrowing overflow is not used here because the internal cast
+ `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
+ on overflow rather than throwing — that would still produce a successful
+ export with garbage data, not a propagated failure.)
+ """
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_runtime_cast_fail_{uid}"
+ iceberg_table = f"iceberg_runtime_cast_fail_{uid}"
+
+ make_rmt(node, mt_table, "id String, year Int32", "year", replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES ('not a number', 2020)")
+
+ make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year")
+
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
+ f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1"
+ )
+
+ wait_for_export_status(node, mt_table, iceberg_table, "2020", "FAILED", timeout=60)
+
+ exception_count = int(node.query(
+ f"SELECT any(exception_count) FROM system.replicated_partition_exports "
+ f"WHERE source_table = '{mt_table}' "
+ f" AND destination_table = '{iceberg_table}' "
+ f" AND partition_id = '2020'"
+ ).strip())
+ assert exception_count > 0, (
+ "Expected non-zero exception_count after a failed runtime cast"
+ )
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 0, (
+ f"Expected 0 rows in Iceberg table after failed export, got {count}"
+ )
From 245c0bb5a3c887b5d642d298435325be16aa8cfa Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 15 Jun 2026 14:20:41 -0300
Subject: [PATCH 06/33] yet another impl
---
src/Core/Settings.cpp | 3 +
src/Core/SettingsChangesHistory.cpp | 1 +
.../MergeTree/ExportPartitionUtils.cpp | 89 +++++++++----------
src/Storages/MergeTree/ExportPartitionUtils.h | 40 +++------
src/Storages/MergeTree/MergeTreeData.cpp | 41 ++-------
src/Storages/StorageReplicatedMergeTree.cpp | 43 ++-------
.../test.py | 61 +++----------
.../test.py | 59 +++---------
.../test.py | 45 ++--------
...rge_tree_part_to_object_storage_simple.sql | 35 ++++++--
...rge_tree_part_to_object_storage_simple.sql | 33 +++++--
11 files changed, 157 insertions(+), 293 deletions(-)
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index ba022db0f83a..6368ec01c1cb 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7605,6 +7605,9 @@ Has no effect on `EXPORT PARTITION ` (single-partition export).
)", 0) \
DECLARE(String, export_merge_tree_part_filename_pattern, "{part_name}_{checksum}", R"(
Pattern for the filename of the exported merge tree part. The `part_name` and `checksum` are calculated and replaced on the fly. Additional macros are supported.
+)", 0) \
+ DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"(
+Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
)", 0) \
\
/* ####################################################### */ \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 50cc8c572561..3a72ec667415 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -43,6 +43,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
{
{"object_storage_cluster_join_mode", "allow", "allow", "New setting"},
{"export_merge_tree_partition_task_timeout_seconds", "3600", "86400", "Increase default value to make it more realistic"},
+ {"export_merge_tree_part_allow_lossy_cast", false, false, "New setting to gate lossy casts in EXPORT PART/PARTITION behind explicit acknowledgment"},
});
addSettingsChanges(settings_changes_history, "26.3",
{
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index f227d318a1fd..ef51b10b9c0a 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -9,6 +9,10 @@
#include
#include
#include
+#include
+#include
+#include
+#include
#include
#include
@@ -38,6 +42,11 @@ namespace ErrorCodes
extern const int NETWORK_ERROR;
}
+namespace Setting
+{
+ extern const SettingsBool export_merge_tree_part_allow_lossy_cast;
+}
+
namespace FailPoints
{
extern const char iceberg_export_after_commit_before_zk_completed[];
@@ -404,18 +413,8 @@ namespace ExportPartitionUtils
#if USE_AVRO
void verifyIcebergPartitionCompatibility(
const Poco::JSON::Object::Ptr & metadata_object,
- const ASTPtr & partition_key_ast,
- const StorageMetadataPtr & source_metadata,
- const StorageMetadataPtr & destination_metadata,
- const StorageID & destination_storage_id)
+ const ASTPtr & partition_key_ast)
{
- /// The Iceberg manifest's partition values are derived from the SOURCE
- /// part's minmax block (see `IcebergMetadata::commitExportPartitionTransaction`)
- /// while the row data is CAST to the destination Iceberg schema by
- /// `addExportConvertingActions`. Reject castable type differences on
- /// partition-key columns up front so the two cannot disagree.
- verifyPartitionColumnsExactTypeMatch(source_metadata, destination_metadata, destination_storage_id);
-
const auto original_schema_id = metadata_object->getValue(Iceberg::f_current_schema_id);
const auto partition_spec_id = metadata_object->getValue(Iceberg::f_default_spec_id);
@@ -531,51 +530,47 @@ namespace ExportPartitionUtils
}
#endif
- void verifyPartitionColumnsExactTypeMatch(
+ void verifyExportSchemaCastable(
const StorageMetadataPtr & source_metadata,
const StorageMetadataPtr & destination_metadata,
- const StorageID & destination_storage_id)
+ const StorageID & destination_storage_id,
+ const ContextPtr & context)
{
- if (!source_metadata->hasPartitionKey())
- return;
+ /// Build (and discard) the same converting DAG the export worker will build
+ /// later, to surface structural mismatches (column count, untyped casts) early.
+ Block source_sample_block;
+ for (const auto & column : source_metadata->getColumns().getReadable())
+ source_sample_block.insert({column.type->createColumn(), column.type, column.name});
+
+ const auto destination_sample_block = destination_metadata->getSampleBlockNonMaterialized();
- /// Use the partition expression's required columns (the ones that flow
- /// into the destination's partition strategy via
- /// `minmax_idx->getBlock(storage)`), not the partition expression's
- /// sample-block output columns. This catches both identity
- /// partitioning (`PARTITION BY year`) and computed partitioning
- /// (`PARTITION BY toYearNumSinceEpoch(event_date)` — required column
- /// is `event_date`).
- const auto required_columns
- = source_metadata->getPartitionKey().expression->getRequiredColumnsWithTypes();
+ const auto source_columns = source_sample_block.getColumnsWithTypeAndName();
+ const auto destination_columns = destination_sample_block.getColumnsWithTypeAndName();
- const auto & destination_columns = destination_metadata->getColumns();
+ (void) ActionsDAG::makeConvertingActions(
+ source_columns,
+ destination_columns,
+ ActionsDAG::MatchColumnsMode::Position,
+ context);
- for (const auto & source_column : required_columns)
+ /// Lossy casts may silently change values, so reject them unless the user opts in.
+ if (context->getSettingsRef()[Setting::export_merge_tree_part_allow_lossy_cast])
+ return;
+
+ const size_t num_columns = std::min(source_columns.size(), destination_columns.size());
+ for (size_t i = 0; i < num_columns; ++i)
{
- if (!destination_columns.has(source_column.name))
- throw Exception(ErrorCodes::BAD_ARGUMENTS,
- "Cannot export partition: source partition key references column '{}' "
- "which does not exist on the destination table {}. "
- "EXPORT does not re-evaluate the partition key against the destination's schema, "
- "so every column the source partition expression depends on must be present "
- "on the destination with the same type.",
- source_column.name, destination_storage_id.getFullTableName());
-
- const auto destination_type
- = destination_columns.getPhysical(source_column.name).type;
- if (!source_column.type->equals(*destination_type))
+ const auto & source_column = source_columns[i];
+ const auto & destination_column = destination_columns[i];
+ if (!canBeSafelyCast(source_column.type, destination_column.type))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
- "Cannot export partition: partition key column '{}' has type {} on the source "
- "but {} on the destination table {}. "
- "EXPORT does not re-evaluate the partition key against the destination's schema, "
- "so castable type differences are not allowed for partition columns "
- "(the partition path and any Iceberg partition values are derived from the "
- "source-typed minmax block, while the row data is CAST to the destination type).",
- source_column.name,
+ "Cannot export to {}: column '{}' requires a lossy cast from {} to {}, "
+ "which may change values. Set `export_merge_tree_part_allow_lossy_cast = 1` "
+ "to allow lossy casts during export.",
+ destination_storage_id.getFullTableName(),
+ destination_column.name,
source_column.type->getName(),
- destination_type->getName(),
- destination_storage_id.getFullTableName());
+ destination_column.type->getName());
}
}
}
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h
index 958a8a819457..d97bb538a47f 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.h
+++ b/src/Storages/MergeTree/ExportPartitionUtils.h
@@ -89,41 +89,23 @@ namespace ExportPartitionUtils
const std::string & exception_message,
const LoggerPtr & log);
- /// EXPORT PART/PARTITION never re-evaluates the partition key against the
- /// destination's schema: the partition path (object storage) and the
- /// Iceberg partition values (data lake) are derived from the SOURCE part's
- /// minmax block in ExportPartTask::executeStep. Therefore every column the
- /// SOURCE partition expression depends on must have an identically-typed
- /// counterpart on the destination — castable but non-equal types are not
- /// safe here even though `makeConvertingActions` would accept them.
- ///
- /// Called directly from the non-data-lake branch of
- /// `MergeTreeData::exportPartToTable` /
- /// `StorageReplicatedMergeTree::exportPartitionToTable`, and indirectly
- /// for the Iceberg branch via `verifyIcebergPartitionCompatibility` which
- /// embeds this check. Throws BAD_ARGUMENTS on missing column or type
- /// mismatch.
- void verifyPartitionColumnsExactTypeMatch(
+ /// Validates that source columns can be exported into the destination with the
+ /// same positional CAST matching as `INSERT INTO dest SELECT * FROM src`. Lossy
+ /// casts are rejected unless `export_merge_tree_part_allow_lossy_cast` is set.
+ /// Throws BAD_ARGUMENTS on any violation.
+ void verifyExportSchemaCastable(
const StorageMetadataPtr & source_metadata,
const StorageMetadataPtr & destination_metadata,
- const StorageID & destination_storage_id);
+ const StorageID & destination_storage_id,
+ const ContextPtr & context);
#if USE_AVRO
- /// Verifies that the source MergeTree partition key is compatible with the
- /// destination Iceberg partition spec by:
- /// * Comparing partition-field source-ids and transforms in order
- /// against the Iceberg spec derived from the MergeTree PARTITION BY AST.
- /// * Embedding `verifyPartitionColumnsExactTypeMatch` so that columns
- /// flowing through the partition key have identical ClickHouse types on
- /// both sides — the Iceberg manifest's partition values come from the
- /// SOURCE-typed minmax block and cannot be reconciled later.
- /// Throws BAD_ARGUMENTS on any mismatch.
+ /// Verifies the source MergeTree partition key matches the destination Iceberg
+ /// partition spec (source-ids and transforms in order). Throws BAD_ARGUMENTS on
+ /// mismatch.
void verifyIcebergPartitionCompatibility(
const Poco::JSON::Object::Ptr & metadata_object,
- const ASTPtr & partition_key_ast,
- const StorageMetadataPtr & source_metadata,
- const StorageMetadataPtr & destination_metadata,
- const StorageID & destination_storage_id);
+ const ASTPtr & partition_key_ast);
#endif
}
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index ee2b7d2250c8..937c675fab2f 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6745,10 +6745,7 @@ void MergeTreeData::exportPartToTable(
ExportPartitionUtils::verifyIcebergPartitionCompatibility(
metadata_object,
- source_metadata_ptr->getPartitionKeyAST(),
- source_metadata_ptr,
- destination_metadata_ptr,
- dest_storage->getStorageID());
+ source_metadata_ptr->getPartitionKeyAST());
}
#else
(void)iceberg_metadata_json_;
@@ -6756,42 +6753,16 @@ void MergeTreeData::exportPartToTable(
#endif
}
- /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
- /// positional matching with CAST. We surface structural mismatches synchronously at
- /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
- /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
- /// untyped cast) fails here; anything it accepts will be CAST at runtime.
- {
- Block source_sample_block;
- for (const auto & column : source_metadata_ptr->getColumns().getReadable())
- source_sample_block.insert({column.type->createColumn(), column.type, column.name});
-
- const auto destination_sample_block = destination_metadata_ptr->getSampleBlockNonMaterialized();
+ /// Positional CAST matching, like `INSERT INTO dest SELECT * FROM src`.
+ ExportPartitionUtils::verifyExportSchemaCastable(
+ source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID(), query_context);
- (void) ActionsDAG::makeConvertingActions(
- source_sample_block.getColumnsWithTypeAndName(),
- destination_sample_block.getColumnsWithTypeAndName(),
- ActionsDAG::MatchColumnsMode::Position,
- query_context);
- }
-
- /// For data lakes (Iceberg) the partition-key checks — both spec/transform
- /// compatibility and ClickHouse partition-column exact-type equality — are
- /// performed inside `verifyIcebergPartitionCompatibility` above.
- /// For non-data-lake destinations we do the AST equivalence check here and
- /// then run the partition-column exact-type check directly.
+ /// Iceberg partition compatibility is checked above; here we only need the
+ /// partition-key ASTs to match (partition-column types follow the lossy-cast gate).
if (!dest_storage->isDataLake())
{
if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST()))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
-
- /// EXPORT does not re-evaluate the partition key against the destination's schema:
- /// the hive partition path is derived from the SOURCE part's minmax block in
- /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
- /// by `addExportConvertingActions`. Reject castable type differences on
- /// partition-key columns synchronously so the path and the row data cannot disagree.
- ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
- source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID());
}
auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated});
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index cb576f137546..2301ec1c1a30 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -8388,42 +8388,16 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
auto src_snapshot = getInMemoryMetadataPtr();
auto destination_snapshot = dest_storage->getInMemoryMetadataPtr();
- /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
- /// positional matching with CAST. We surface structural mismatches synchronously at
- /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
- /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
- /// untyped cast) fails here; anything it accepts will be CAST at runtime.
- {
- Block source_sample_block;
- for (const auto & column : src_snapshot->getColumns().getReadable())
- source_sample_block.insert({column.type->createColumn(), column.type, column.name});
-
- const auto destination_sample_block = destination_snapshot->getSampleBlockNonMaterialized();
-
- (void) ActionsDAG::makeConvertingActions(
- source_sample_block.getColumnsWithTypeAndName(),
- destination_sample_block.getColumnsWithTypeAndName(),
- ActionsDAG::MatchColumnsMode::Position,
- query_context);
- }
-
- /// For data lakes (Iceberg) the partition-key checks — both spec/transform
- /// compatibility and ClickHouse partition-column exact-type equality — are
- /// performed inside `verifyIcebergPartitionCompatibility` below (per partition).
- /// For non-data-lake destinations we do the AST equivalence check here and
- /// then run the partition-column exact-type check directly.
+ /// Positional CAST matching, like `INSERT INTO dest SELECT * FROM src`.
+ ExportPartitionUtils::verifyExportSchemaCastable(
+ src_snapshot, destination_snapshot, dest_storage->getStorageID(), query_context);
+
+ /// Iceberg partition compatibility is checked below; here we only need the
+ /// partition-key ASTs to match (partition-column types follow the lossy-cast gate).
if (!dest_storage->isDataLake())
{
if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST()))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
-
- /// EXPORT does not re-evaluate the partition key against the destination's schema:
- /// the hive partition path is derived from the SOURCE part's minmax block in
- /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
- /// by `addExportConvertingActions`. Reject castable type differences on
- /// partition-key columns synchronously so the path and the row data cannot disagree.
- ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
- src_snapshot, destination_snapshot, dest_storage->getStorageID());
}
zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly();
@@ -8597,10 +8571,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
ExportPartitionUtils::verifyIcebergPartitionCompatibility(
metadata_object,
- src_snapshot->getPartitionKeyAST(),
- src_snapshot,
- destination_snapshot,
- dest_storage->getStorageID());
+ src_snapshot->getPartitionKeyAST());
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index 43ba24116526..f51dfa3c602e 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -385,28 +385,12 @@ def test_export_part_with_year_transform_partition(cluster):
node.query(f"DROP TABLE IF EXISTS {iceberg}")
-def test_export_part_partition_column_type_mismatch_is_rejected(cluster):
- """
- EXPORT PART must synchronously reject (BAD_ARGUMENTS) when a partition
- column has different (even castable) types on source and destination.
-
- The partition path / Iceberg partition values are computed from the
- SOURCE part's minmax block in ExportPartTask::executeStep before
- addExportConvertingActions casts the data, so a castable mismatch on
- a partition column would let the path/value computation and the row
- data disagree. The check in
- ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch rejects this
- at ALTER time.
-
- Failing case: source `year Int32` -> destination `year Int64`, both
- PARTITION BY year (identity transform). Both Int32 and Int64 are valid
- Iceberg primitives, so verifyIcebergPartitionCompatibility happily
- accepts the spec — only the exact-type check rejects.
- """
+def test_export_part_partition_column_lossless_widening(cluster):
+ """A lossless widening of a partition column (year Int32 -> Int64) round-trips."""
node = cluster.instances["node1"]
sfx = unique_suffix()
- mt = f"mt_pcol_type_mismatch_{sfx}"
- iceberg = f"iceberg_pcol_type_mismatch_{sfx}"
+ mt = f"mt_pcol_widening_{sfx}"
+ iceberg = f"iceberg_pcol_widening_{sfx}"
make_mt(node, mt, "id Int32, year Int32", "year")
make_iceberg_s3(node, iceberg, "id Int32, year Int64", "year")
@@ -414,39 +398,20 @@ def test_export_part_partition_column_type_mismatch_is_rejected(cluster):
node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)")
part_2020 = get_part(node, mt, "2020")
+ export_part(node, mt, part_2020, iceberg)
+ wait_for_export_part(node, mt, part_2020)
- error = node.query_and_get_error(
- f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
- f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
- f"allow_experimental_insert_into_iceberg = 1"
- )
- assert "BAD_ARGUMENTS" in error, (
- f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
- f"got: {error!r}"
- )
- assert "partition key column 'year'" in error, (
- f"Expected the error message to identify the offending partition "
- f"column 'year', got: {error!r}"
- )
+ count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+ assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
- # No async work scheduled: no ExportPart row in system.part_log.
- node.query("SYSTEM FLUSH LOGS")
- part_log_count = node.query(
- f"SELECT count() FROM system.part_log "
- f"WHERE event_type = 'ExportPart' "
- f" AND database = currentDatabase() "
- f" AND table = '{mt}' "
- f" AND part_name = '{part_2020}'"
+ result = node.query(
+ f"SELECT id, toTypeName(year), year FROM {iceberg} ORDER BY id"
).strip()
- assert part_log_count == "0", (
- f"Expected no ExportPart entries in system.part_log after a "
- f"synchronously-rejected export, found {part_log_count}."
+ assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", (
+ f"Unexpected widened partition-column data:\n{result}"
)
- count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
- assert count == 0, (
- f"Expected 0 rows in Iceberg table after rejected export, got {count}"
- )
+ assert_part_log(node, mt, part_2020)
node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
node.query(f"DROP TABLE IF EXISTS {iceberg}")
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 45aa565b3586..6bd080bc72f0 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,30 +746,13 @@ def test_partition_key_compatibility_check(cluster):
)
-def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
- """
- EXPORT PARTITION must synchronously reject (BAD_ARGUMENTS) when a
- partition column has a different (even castable) type on the source
- MergeTree and the destination Iceberg table.
-
- The Iceberg destination's partition path is derived from the SOURCE
- part's minmax block in ExportPartTask::executeStep, and the Iceberg
- manifest's partition values come from the SOURCE-typed
- `manifest.data_part->partition.value` Fields. Letting a castable type
- mismatch through would mean the manifest/path computation and the row
- data (which IS cast by addExportConvertingActions) disagree — not
- equivalent to INSERT SELECT.
-
- Failing case: source `year Int32` -> destination `year Int64`, both
- PARTITION BY year (identity transform). Both are valid Iceberg primitives
- so verifyIcebergPartitionCompatibility accepts the spec; only the new
- exact-type check rejects.
- """
+def test_export_partition_partition_column_lossless_widening(cluster):
+ """A lossless widening of a partition column (year Int32 -> Int64) round-trips."""
node = cluster.instances["replica1"]
uid = unique_suffix()
- mt_table = f"mt_pcol_type_mismatch_{uid}"
- iceberg_table = f"iceberg_pcol_type_mismatch_{uid}"
+ mt_table = f"mt_pcol_widening_{uid}"
+ iceberg_table = f"iceberg_pcol_widening_{uid}"
# Source uses year Int32; destination Iceberg uses year Int64.
make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
@@ -780,36 +763,20 @@ def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
partition_by="year", s3_retry_attempts=3,
)
- error = node.query_and_get_error(
+ node.query(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
settings={"allow_insert_into_iceberg": 1},
)
- assert "BAD_ARGUMENTS" in error, (
- f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
- f"got: {error!r}"
- )
- assert "partition key column 'year'" in error, (
- f"Expected the error message to identify the offending partition "
- f"column 'year', got: {error!r}"
- )
-
- # No async work scheduled.
- rows_in_system_view = node.query(
- f"SELECT count() FROM system.replicated_partition_exports "
- f"WHERE source_table = '{mt_table}' "
- f" AND destination_table = '{iceberg_table}' "
- f" AND partition_id = '2020'"
- ).strip()
- assert rows_in_system_view == "0", (
- f"Expected no row in system.replicated_partition_exports after a "
- f"synchronously-rejected export, got {rows_in_system_view}."
- )
+ wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
- # Destination must remain empty.
count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
- assert count == 0, (
- f"Expected 0 rows in Iceberg destination after rejected export, "
- f"got {count}"
+ assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
+
+ result = node.query(
+ f"SELECT id, toTypeName(year), year FROM {iceberg_table} ORDER BY id"
+ ).strip()
+ assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", (
+ f"Unexpected widened partition-column data:\n{result}"
)
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index 5c3e656107bc..d403538999df 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -1533,41 +1533,8 @@ def test_export_partition_all(cluster):
def test_export_partition_partition_column_castable_type_mismatch(cluster):
- """
- Post-fix contract: EXPORT PARTITION must reject castable but non-equal
- partition-column types synchronously.
-
- Setup
- -----
- * Source (ReplicatedMergeTree): ``id UInt64, year String``,
- ``PARTITION BY year``
- * Destination (S3, partition_strategy='hive'): ``id UInt64,
- year UInt16``, ``PARTITION BY year``
-
- Why the synchronous rejection is required
- -----------------------------------------
- ``makeConvertingActions(Position)`` accepts ``String -> UInt16`` and the
- PARTITION BY ASTs are textually identical, so the schema/AST validators
- would let this through. But EXPORT does not re-evaluate the partition
- key against the destination's schema: in ``ExportPartTask::executeStep``
- the block fed to ``destination_storage->import`` is built from the
- source part's minmax index (SOURCE types). If we let the export proceed
- the hive partition path is computed from the un-cast SOURCE value while
- the row data is the CAST value — silent divergence from INSERT SELECT.
- The exact-type partition-column check in
- ``ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch`` rejects
- this pairing at ALTER time.
-
- Assertions
- ----------
- * ``ALTER ... EXPORT PARTITION ID '' TO TABLE ...`` raises a
- ``BAD_ARGUMENTS`` exception whose text references the partition
- column type mismatch.
- * No row is created in ``system.replicated_partition_exports`` for
- this (source_table, destination_table, partition_id) — nothing
- was ever scheduled.
- * No parquet file is written under any ``year=*/`` prefix in S3.
- """
+ """A lossy partition-column cast (year String -> UInt16) is rejected synchronously
+ when export_merge_tree_part_allow_lossy_cast is off, scheduling nothing."""
skip_if_remote_database_disk_enabled(cluster)
node = cluster.instances["replica1"]
@@ -1614,12 +1581,12 @@ def test_export_partition_partition_column_castable_type_mismatch(cluster):
f"TO TABLE {s3_table}"
)
assert "BAD_ARGUMENTS" in error, (
- f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+ f"Expected BAD_ARGUMENTS for a lossy partition-column cast, "
f"got: {error!r}"
)
- assert "partition key column 'year'" in error, (
- f"Expected the error message to identify the offending partition "
- f"column 'year', got: {error!r}"
+ assert "requires a lossy cast" in error and "'year'" in error, (
+ f"Expected the error message to report the lossy cast on column "
+ f"'year', got: {error!r}"
)
# Nothing scheduled: no row in system.replicated_partition_exports.
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index 9357f4e5aa78..e40c5ebde461 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
-- Tags: no-parallel, no-fasttest
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3;
SET allow_experimental_export_merge_tree_part=1;
@@ -39,16 +39,37 @@ INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice');
ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH}
--- Castable but non-equal partition-column types must be rejected synchronously.
--- The partition path is computed from the source part's minmax block (SOURCE types)
--- BEFORE row data is CAST to destination types — letting String -> UInt16 through
--- would cause silent divergence between partition path and row values.
+-- Partition columns follow the same lossy-cast gate as any other column (no special
+-- exact-type guard). String -> UInt16 is a lossy cast, so with the default
+-- export_merge_tree_part_allow_lossy_cast = 0 it is rejected synchronously.
CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
--- Partition-column type mismatch is checked synchronously BEFORE the part lookup,
+-- The lossy-cast check fires synchronously BEFORE the part lookup,
-- so we don't need to insert data nor use a real part name to provoke it.
ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
+-- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
+-- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
+-- fake part name is enough to provoke it.
+CREATE TABLE 03572_lossy_mt (id Int64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+-- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
+-- passes and execution proceeds to the part lookup, which fails on the fake part name.
+ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
+
+-- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
+-- validation passes and we reach the part lookup, which fails on the fake part name.
+CREATE TABLE 03572_lossless_mt (id Int32, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossless_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
+
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index 874b5f288cf7..e20fc14a26a5 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
-- Tags: no-parallel, no-fasttest
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3;
CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple();
@@ -21,14 +21,35 @@ CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_
ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED}
--- Castable but non-equal partition-column types must be rejected synchronously.
--- The partition path is computed from the source part's minmax block (SOURCE types)
--- BEFORE row data is CAST to destination types — letting String -> UInt16 through
--- would cause silent divergence between partition path and row values.
+-- Partition columns follow the same lossy-cast gate as any other column (no special
+-- exact-type guard). String -> UInt16 is a lossy cast, so with the default
+-- export_merge_tree_part_allow_lossy_cast = 0 it is rejected synchronously.
CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
+-- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
+-- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
+-- fake part name is enough to provoke it.
+CREATE TABLE 03572_rmt_lossy_mt (id Int64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossy/03572_rmt_lossy_mt', 'replica1') PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_rmt_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+-- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
+-- passes and execution proceeds to the part lookup, which fails on the fake part name.
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
+
+-- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
+-- validation passes and we reach the part lookup, which fails on the fake part name.
+CREATE TABLE 03572_rmt_lossless_mt (id Int32, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossless/03572_rmt_lossless_mt', 'replica1') PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_rmt_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_rmt_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossless_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
+
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3;
From 8d2b0862604dd7fca778d8de5a598fbe92d9ce5f Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 15 Jun 2026 14:42:27 -0300
Subject: [PATCH 07/33] fix tests and background setting
---
.../MergeTree/ExportPartitionUtils.cpp | 3 ++
.../test.py | 47 +++++++++----------
.../test.py | 41 +++++++---------
3 files changed, 41 insertions(+), 50 deletions(-)
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index ef51b10b9c0a..c902a4042218 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -102,6 +102,9 @@ namespace ExportPartitionUtils
/// stalls when the setting is only set at the query level.
context_copy->setSetting("allow_insert_into_iceberg", true);
+ /// This setting, just like allow_insert_into_iceberg, has been verified at request time. At this point, we allow everything.
+ context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", true);
+
return context_copy;
}
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index f51dfa3c602e..4a8514ef3ae1 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -69,11 +69,15 @@ def get_part(node, table: str, partition_id: str) -> str:
).strip()
-def export_part(node, table: str, part: str, dest: str) -> None:
+def export_part(node, table: str, part: str, dest: str, extra_settings: str = "") -> None:
+ settings = (
+ "allow_experimental_export_merge_tree_part = 1, "
+ "allow_experimental_insert_into_iceberg = 1"
+ )
+ if extra_settings:
+ settings += ", " + extra_settings
node.query(
- f"ALTER TABLE {table} EXPORT PART '{part}' TO TABLE {dest} "
- f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
- f"allow_experimental_insert_into_iceberg = 1"
+ f"ALTER TABLE {table} EXPORT PART '{part}' TO TABLE {dest} SETTINGS {settings}"
)
@@ -560,10 +564,8 @@ def test_export_part_writes_column_statistics(cluster):
# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands.
# * Destination column names need not match source names — positional pairs
# are CAST element-wise.
-# * Castable type differences are accepted at validation time (widening and
-# even lossy narrowing). Lossiness only matters at runtime: if the value
-# does not fit the destination type, the async export worker fails and the
-# failure surfaces via system.part_log; the Iceberg table is untouched.
+# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
+# validation time unless export_merge_tree_part_allow_lossy_cast = 1.
# ---------------------------------------------------------------------------
@@ -701,10 +703,8 @@ def test_export_part_with_castable_widening(cluster):
def test_export_part_with_castable_narrowing_values_fit(cluster):
- """
- Source column is Int64, destination expects Int32 — lossy in general but
- not blocked, mirroring INSERT SELECT.
- """
+ """A lossy narrowing (id Int64 -> Int32) succeeds once the user opts in via
+ export_merge_tree_part_allow_lossy_cast."""
node = cluster.instances["node1"]
sfx = unique_suffix()
mt = f"mt_narrow_fit_{sfx}"
@@ -716,7 +716,7 @@ def test_export_part_with_castable_narrowing_values_fit(cluster):
node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)")
part_2020 = get_part(node, mt, "2020")
- export_part(node, mt, part_2020, iceberg)
+ export_part(node, mt, part_2020, iceberg, "export_merge_tree_part_allow_lossy_cast = 1")
wait_for_export_part(node, mt, part_2020)
count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
@@ -736,18 +736,13 @@ def test_export_part_with_castable_narrowing_values_fit(cluster):
def test_export_part_runtime_cast_failure_propagates_async(cluster):
- """
- Source has a String column whose value cannot be parsed as the destination
- Int32 column. Validation accepts the type pair (String -> Int32 is a
- defined conversion) so the ALTER returns synchronously; the actual cast
- runs in the async export worker and must fail there. The failure is
- propagated to system.part_log (non-zero `error` column with a non-empty
- `exception` text) and the Iceberg table is left empty.
-
- (Integer narrowing overflow is not used here because the internal cast
- `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
- on overflow rather than throwing — that would still produce a successful
- export with garbage data, not a propagated failure.)
+ """A String value that cannot be parsed as the destination Int32 passes the
+ synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but
+ fails at runtime in the async worker; the failure surfaces in system.part_log and
+ Iceberg is left empty.
+
+ (Integer overflow is not used because the internal cast uses CastType::nonAccurate,
+ which wraps rather than throwing.)
"""
node = cluster.instances["node1"]
sfx = unique_suffix()
@@ -760,7 +755,7 @@ def test_export_part_runtime_cast_failure_propagates_async(cluster):
node.query(f"INSERT INTO {mt} VALUES ('not a number', 2020)")
part_2020 = get_part(node, mt, "2020")
- export_part(node, mt, part_2020, iceberg)
+ export_part(node, mt, part_2020, iceberg, "export_merge_tree_part_allow_lossy_cast = 1")
exception = wait_for_failed_export_part(node, mt, part_2020)
assert exception, (
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 6bd080bc72f0..8e3f055c4317 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -1033,11 +1033,8 @@ def test_export_partition_writes_column_statistics(cluster):
# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled.
# * Destination column names need not match source names — positional pairs
# are CAST element-wise.
-# * Castable type differences are accepted at validation time (widening and
-# even lossy narrowing). Lossiness only matters at runtime: if the value
-# does not fit the destination type, the async export worker fails and the
-# export is marked FAILED in system.replicated_partition_exports; the Iceberg
-# table is left untouched.
+# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
+# validation time unless export_merge_tree_part_allow_lossy_cast = 1.
# ---------------------------------------------------------------------------
@@ -1200,10 +1197,8 @@ def test_export_partition_with_castable_widening(cluster):
def test_export_partition_with_castable_narrowing_values_fit(cluster):
- """
- Source column is Int64, destination expects Int32 — lossy in general but
- not blocked, mirroring INSERT SELECT.
- """
+ """A lossy narrowing (id Int64 -> Int32) succeeds once the user opts in via
+ export_merge_tree_part_allow_lossy_cast."""
node = cluster.instances["replica1"]
uid = unique_suffix()
@@ -1217,7 +1212,10 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster):
node.query(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
- settings={"allow_insert_into_iceberg": 1},
+ settings={
+ "allow_insert_into_iceberg": 1,
+ "export_merge_tree_part_allow_lossy_cast": 1,
+ },
)
wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
@@ -1233,19 +1231,13 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster):
def test_export_partition_runtime_cast_failure_propagates_async(cluster):
- """
- Source has a String column whose value cannot be parsed as the destination
- Int32 column. Validation accepts the type pair (String -> Int32 is a
- defined conversion) so the ALTER returns synchronously; the actual cast
- runs in the async export worker and must fail there. The failure exhausts
- the retry budget and the export is marked FAILED in
- system.replicated_partition_exports with a non-zero exception_count, while
- the Iceberg table is left empty.
-
- (Integer narrowing overflow is not used here because the internal cast
- `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
- on overflow rather than throwing — that would still produce a successful
- export with garbage data, not a propagated failure.)
+ """A String value that cannot be parsed as the destination Int32 passes the
+ synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but
+ fails at runtime in the async worker, marking the export FAILED and leaving Iceberg
+ empty.
+
+ (Integer overflow is not used because the internal cast uses CastType::nonAccurate,
+ which wraps rather than throwing.)
"""
node = cluster.instances["replica1"]
@@ -1260,7 +1252,8 @@ def test_export_partition_runtime_cast_failure_propagates_async(cluster):
node.query(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
- f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1"
+ f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1, "
+ f"export_merge_tree_part_allow_lossy_cast = 1"
)
wait_for_export_status(node, mt_table, iceberg_table, "2020", "FAILED", timeout=60)
From 4100ecd322666697c5c6685ded4e53f1588091f9 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 15 Jun 2026 15:45:22 -0300
Subject: [PATCH 08/33] fix tests
---
...rge_tree_part_to_object_storage_simple.sql | 21 +++++++------------
...rge_tree_part_to_object_storage_simple.sql | 20 ++++++++----------
2 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index e40c5ebde461..8200233a7322 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -45,31 +45,26 @@ ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_mat
CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
--- The lossy-cast check fires synchronously BEFORE the part lookup,
--- so we don't need to insert data nor use a real part name to provoke it.
-ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3
+ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_partition_type_mismatch_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
--- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
--- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
--- fake part name is enough to provoke it.
CREATE TABLE 03572_lossy_mt (id Int64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
-ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+ALTER TABLE 03572_lossy_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossy_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
--- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
--- passes and execution proceeds to the part lookup, which fails on the fake part name.
-ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+-- With the acknowledgment setting enabled, the lossy cast passes validation and reaches the
+-- part lookup, which fails because the part does not exist.
+ALTER TABLE 03572_lossy_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossy_s3
SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
--- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
--- validation passes and we reach the part lookup, which fails on the fake part name.
+-- A lossless widening cast (Int32 -> Int64) passes validation without the setting and reaches
+-- the part lookup, which fails because the part does not exist.
CREATE TABLE 03572_lossless_mt (id Int32, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
-ALTER TABLE 03572_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossless_s3
+ALTER TABLE 03572_lossless_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossless_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index e20fc14a26a5..0a199755c40a 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -27,29 +27,27 @@ ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_sche
CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
-ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3
+ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_partition_type_mismatch_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
--- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
--- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
--- fake part name is enough to provoke it.
+-- A lossy cast on a non-partition column (Int64 -> Int32) is rejected synchronously by default.
CREATE TABLE 03572_rmt_lossy_mt (id Int64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossy/03572_rmt_lossy_mt', 'replica1') PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_rmt_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
-ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossy_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
--- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
--- passes and execution proceeds to the part lookup, which fails on the fake part name.
-ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+-- With the acknowledgment setting enabled, the lossy cast passes validation and reaches the
+-- part lookup, which fails because the part does not exist.
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossy_s3
SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
--- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
--- validation passes and we reach the part lookup, which fails on the fake part name.
+-- A lossless widening cast (Int32 -> Int64) passes validation without the setting and reaches
+-- the part lookup, which fails because the part does not exist.
CREATE TABLE 03572_rmt_lossless_mt (id Int32, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossless/03572_rmt_lossless_mt', 'replica1') PARTITION BY year ORDER BY tuple();
CREATE TABLE 03572_rmt_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
-ALTER TABLE 03572_rmt_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossless_s3
+ALTER TABLE 03572_rmt_lossless_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossless_s3
SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3;
From 914de7283facdfb4c456807b26403ccd76c1a578 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 15 Jun 2026 17:51:54 -0300
Subject: [PATCH 09/33] some more test fixes
---
.../test.py | 16 --
.../test.py | 246 +++++++++++++-----
2 files changed, 185 insertions(+), 77 deletions(-)
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index 4a8514ef3ae1..486adf1f2b17 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -553,22 +553,6 @@ def test_export_part_writes_column_statistics(cluster):
node.query(f"DROP TABLE IF EXISTS {iceberg}")
-# ---------------------------------------------------------------------------
-# Schema compatibility (INSERT SELECT mirror)
-#
-# EXPORT PART builds an ActionsDAG via makeConvertingActions(Position) — the
-# same primitive that powers INSERT INTO dest SELECT * FROM src. The tests
-# below pin that contract:
-#
-# * Column counts must match positionally; otherwise the ALTER is rejected
-# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands.
-# * Destination column names need not match source names — positional pairs
-# are CAST element-wise.
-# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
-# validation time unless export_merge_tree_part_allow_lossy_cast = 1.
-# ---------------------------------------------------------------------------
-
-
def test_export_part_column_count_mismatch_source_more_is_rejected(cluster):
"""
Source has 3 columns (id, year, extra), destination has 2 (id, year).
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 8e3f055c4317..09cf1c2ff467 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,40 +746,6 @@ def test_partition_key_compatibility_check(cluster):
)
-def test_export_partition_partition_column_lossless_widening(cluster):
- """A lossless widening of a partition column (year Int32 -> Int64) round-trips."""
- node = cluster.instances["replica1"]
-
- uid = unique_suffix()
- mt_table = f"mt_pcol_widening_{uid}"
- iceberg_table = f"iceberg_pcol_widening_{uid}"
-
- # Source uses year Int32; destination Iceberg uses year Int64.
- make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
- node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)")
-
- make_iceberg_s3(
- node, iceberg_table, "id Int64, year Int64",
- partition_by="year", s3_retry_attempts=3,
- )
-
- node.query(
- f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
- settings={"allow_insert_into_iceberg": 1},
- )
- wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
- count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
- assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
-
- result = node.query(
- f"SELECT id, toTypeName(year), year FROM {iceberg_table} ORDER BY id"
- ).strip()
- assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", (
- f"Unexpected widened partition-column data:\n{result}"
- )
-
-
def test_export_ttl(cluster):
"""
After a manifest TTL expires the same partition can be re-exported, and the
@@ -1020,24 +986,6 @@ def test_export_partition_writes_column_statistics(cluster):
assert_exported_stats(entries)
-# ---------------------------------------------------------------------------
-# Schema compatibility (INSERT SELECT mirror)
-#
-# EXPORT PARTITION builds an ActionsDAG via makeConvertingActions(Position) — the
-# same primitive that powers INSERT INTO dest SELECT * FROM src. The destination
-# compatibility validation in StorageReplicatedMergeTree::exportPartitionToTable
-# mirrors what the plain-MergeTree EXPORT PART path does in MergeTreeData, so the
-# tests below pin the same contract for the replicated partition export:
-#
-# * Column counts must match positionally; otherwise the ALTER is rejected
-# synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled.
-# * Destination column names need not match source names — positional pairs
-# are CAST element-wise.
-# * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
-# validation time unless export_merge_tree_part_allow_lossy_cast = 1.
-# ---------------------------------------------------------------------------
-
-
def test_export_partition_column_count_mismatch_source_more_is_rejected(cluster):
"""
Source has 3 columns (id, year, extra), destination has 2 (id, year).
@@ -1162,12 +1110,8 @@ def test_export_partition_with_renamed_destination_column(cluster):
def test_export_partition_with_castable_widening(cluster):
- """
- Source column is Int32, destination expects Int64. makeConvertingActions
- inserts a widening CAST; the export must succeed and the values must land
- as Int64 in Iceberg. The partition column `year` keeps the same Int32 type
- on both sides so the exact-type partition check is satisfied.
- """
+ """A lossless widening of both a data column (id Int32 -> Int64) and the
+ partition column (year Int32 -> Int64) round-trips."""
node = cluster.instances["replica1"]
uid = unique_suffix()
@@ -1177,7 +1121,7 @@ def test_export_partition_with_castable_widening(cluster):
make_rmt(node, mt_table, "id Int32, year Int32", "year", replica_name="replica1")
node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)")
- make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year")
+ make_iceberg_s3(node, iceberg_table, "id Int64, year Int64", partition_by="year")
node.query(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
@@ -1189,9 +1133,9 @@ def test_export_partition_with_castable_widening(cluster):
assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}"
result = node.query(
- f"SELECT id, toTypeName(id), year FROM {iceberg_table} ORDER BY id"
+ f"SELECT id, toTypeName(id), year, toTypeName(year) FROM {iceberg_table} ORDER BY id"
).strip()
- assert result == "1\tInt64\t2020\n2\tInt64\t2020", (
+ assert result == "1\tInt64\t2020\tInt64\n2\tInt64\t2020\tInt64", (
f"Unexpected widened data:\n{result}"
)
@@ -1230,6 +1174,31 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster):
)
+def test_export_partition_lossy_cast_rejected_without_optin(cluster):
+ """A lossy narrowing (id Int64 -> Int32) is rejected synchronously with
+ BAD_ARGUMENTS unless export_merge_tree_part_allow_lossy_cast is set."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_lossy_reject_{uid}"
+ iceberg_table = f"iceberg_lossy_reject_{uid}"
+
+ make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES (1, 2020)")
+
+ make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year")
+
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
+ f"SETTINGS allow_insert_into_iceberg = 1"
+ )
+ assert "BAD_ARGUMENTS" in error, f"Expected BAD_ARGUMENTS, got: {error!r}"
+ assert "lossy cast" in error, f"Expected 'lossy cast' in error, got: {error!r}"
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 0, f"Expected no rows after a rejected export, got {count}"
+
+
def test_export_partition_runtime_cast_failure_propagates_async(cluster):
"""A String value that cannot be parsed as the destination Int32 passes the
synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but
@@ -1272,3 +1241,158 @@ def test_export_partition_runtime_cast_failure_propagates_async(cluster):
assert count == 0, (
f"Expected 0 rows in Iceberg table after failed export, got {count}"
)
+
+
+def test_export_partition_all_iceberg_types(cluster):
+ """Every getIcebergType-supported type round-trips through an EXPORT PARTITION:
+ scalars use narrower source types (explicit lossless widening CASTs), plus
+ Array/Map/Tuple nested columns."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_all_types_{uid}"
+ iceberg_table = f"iceberg_all_types_{uid}"
+
+ # Scalar source types are strictly narrower than the destination; the export inserts
+ # a positional widening CAST per column (Int8->Int16, UInt32->UInt64, ...). Nested
+ # columns keep the same type on both sides.
+ source_columns = (
+ "i16 Int8, u16 UInt8, u32 UInt16, u64 UInt32, "
+ "id Int16, big Int32, f32 Float32, f64 Float64, "
+ "d Date, d32 Date32, dt DateTime, dt64 DateTime64(6), "
+ "s String, uid UUID, "
+ "arr Array(Int32), m Map(String, Int64), tup Tuple(a Int32, b String), "
+ "year Int32"
+ )
+ dest_columns = (
+ "i16 Int16, u16 UInt16, u32 UInt32, u64 UInt64, "
+ "id Int32, big Int64, f32 Float32, f64 Float64, "
+ "d Date, d32 Date32, dt DateTime, dt64 DateTime64(6), "
+ "s String, uid UUID, "
+ "arr Array(Int32), m Map(String, Int64), tup Tuple(a Int32, b String), "
+ "year Int32"
+ )
+
+ make_rmt(node, mt_table, source_columns, "year", replica_name="replica1")
+ make_iceberg_s3(node, iceberg_table, dest_columns, partition_by="year")
+
+ node.query(
+ f"""
+ INSERT INTO {mt_table}
+ (i16, u16, u32, u64, id, big, f32, f64, d, d32, dt, dt64, s, uid, arr, m, tup, year)
+ VALUES (
+ -100, 200, 50000, 4000000000,
+ 12345, 1000000000, 3.14, 2.718281828459045,
+ '2024-01-15', '2024-01-15', '2024-01-15 12:30:45', '2024-01-15 12:30:45.123456',
+ 'hello iceberg', '550e8400-e29b-41d4-a716-446655440000',
+ [1, 2, 3], {{'a': 10, 'b': 20}}, (7, 'seven'), 2024
+ )
+ """
+ )
+
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2024' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ wait_for_export_status(node, mt_table, iceberg_table, "2024", "COMPLETED")
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 1, f"Expected 1 row in Iceberg table, got {count}"
+
+ result = node.query(
+ f"""
+ SELECT
+ i16, u16, u32, u64, id, big,
+ toString(d), toString(d32), toString(dt),
+ s, toString(uid),
+ arr, m['a'], m['b'], tup.a, tup.b, year
+ FROM {iceberg_table}
+ """
+ ).strip()
+ expected = "\t".join([
+ "-100", "200", "50000", "4000000000",
+ "12345", "1000000000",
+ "2024-01-15", "2024-01-15", "2024-01-15 12:30:45.000000",
+ "hello iceberg", "550e8400-e29b-41d4-a716-446655440000",
+ "[1,2,3]", "10", "20", "7", "seven", "2024",
+ ])
+ assert result == expected, f"Unexpected round-trip data:\n{result!r}\nexpected:\n{expected!r}"
+
+ # Floats compared with a tolerance to avoid formatting flakiness.
+ floats_ok = node.query(
+ f"SELECT abs(f32 - 3.14) < 1e-4 AND abs(f64 - 2.718281828459045) < 1e-12 FROM {iceberg_table}"
+ ).strip()
+ assert floats_ok == "1", f"Float round-trip outside tolerance: {floats_ok!r}"
+
+ # DateTime64 sub-second component: assert the date part is preserved (exact format varies).
+ ts_result = node.query(f"SELECT dt64 FROM {iceberg_table}").strip()
+ assert "2024-01-15" in ts_result, f"DateTime64 date component missing: {ts_result!r}"
+
+
+def test_export_partition_all_iceberg_types_lossy(cluster):
+ """Lossy narrowing casts across types succeed with the opt-in flag: values that
+ fit round-trip, Float64 -> Float32 loses precision, and Nullable columns carry
+ both NULL and non-NULL (the latter via a lossy Nullable(Int64) -> Nullable(Int32))."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_lossy_types_{uid}"
+ iceberg_table = f"iceberg_lossy_types_{uid}"
+
+ # Each source column is wider than the destination, so the export inserts a lossy
+ # narrowing CAST (allowed only because export_merge_tree_part_allow_lossy_cast=1).
+ # Int8/UInt8 are not Iceberg-representable, so the narrowest integer dest is Int16.
+ source_columns = (
+ "big Int64, ubig UInt64, mid Int32, "
+ "f Float64, dt DateTime64(6), d Date32, "
+ "opt_s Nullable(String), opt_i Nullable(Int64), year Int32"
+ )
+ dest_columns = (
+ "big Int32, ubig UInt32, mid Int16, "
+ "f Float32, dt DateTime, d Date, "
+ "opt_s Nullable(String), opt_i Nullable(Int32), year Int32"
+ )
+
+ make_rmt(node, mt_table, source_columns, "year", replica_name="replica1")
+ make_iceberg_s3(node, iceberg_table, dest_columns, partition_by="year")
+
+ # Values chosen to fit the destination types (the async cast wraps on overflow
+ # rather than throwing, so out-of-range values would silently corrupt instead).
+ # opt_s is NULL and opt_i is set, covering both nullable paths in one row.
+ node.query(
+ f"""
+ INSERT INTO {mt_table} (big, ubig, mid, f, dt, d, opt_s, opt_i, year)
+ VALUES (
+ 1000000, 2000000000, 30000,
+ 2.718281828459045, '2024-01-15 12:30:45.123456', '2024-01-15',
+ NULL, 100, 2024
+ )
+ """
+ )
+
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2024' TO TABLE {iceberg_table}",
+ settings={
+ "allow_insert_into_iceberg": 1,
+ "export_merge_tree_part_allow_lossy_cast": 1,
+ },
+ )
+ wait_for_export_status(node, mt_table, iceberg_table, "2024", "COMPLETED")
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 1, f"Expected 1 row in Iceberg table, got {count}"
+
+ result = node.query(
+ f"SELECT big, ubig, mid, toString(d), toString(dt), opt_s, opt_i, year FROM {iceberg_table}"
+ ).strip()
+ expected = "\t".join([
+ "1000000", "2000000000", "30000",
+ "2024-01-15", "2024-01-15 12:30:45.000000", "\\N", "100", "2024",
+ ])
+ assert result == expected, f"Unexpected lossy round-trip data:\n{result!r}\nexpected:\n{expected!r}"
+
+ # Float64 -> Float32 stays within Float32 precision but is no longer exact.
+ f_checks = node.query(
+ f"SELECT abs(f - 2.718281828459045) < 1e-6, abs(f - 2.718281828459045) > 1e-9 FROM {iceberg_table}"
+ ).strip()
+ assert f_checks == "1\t1", f"Expected Float32 precision loss within tolerance, got: {f_checks!r}"
From 6f822c8074bcf9aeea7544024a5f226a429a1cfb Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 15 Jun 2026 17:55:35 -0300
Subject: [PATCH 10/33] add docs
---
docs/en/antalya/part_export.md | 6 ++++++
docs/en/antalya/partition_export.md | 6 ++++++
2 files changed, 12 insertions(+)
diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md
index 03f9f479991b..b73efa277e2a 100644
--- a/docs/en/antalya/part_export.md
+++ b/docs/en/antalya/part_export.md
@@ -107,6 +107,12 @@ In case a table function is used as the destination, the schema can be omitted a
- **Default**: `{part_name}_{checksum}`
- **Description**: Pattern for the filename of the exported merge tree part. The `part_name` and `checksum` are calculated and replaced on the fly. Additional macros are supported.
+### `export_merge_tree_part_allow_lossy_cast` (Optional)
+
+- **Type**: `Bool`
+- **Default**: `false`
+- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+
## Examples
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index 975915859482..9bd1dfcd16f1 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -111,6 +111,12 @@ Notes:
- Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation.
- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires.
+### `export_merge_tree_part_allow_lossy_cast` (Optional)
+
+- **Type**: `Bool`
+- **Default**: `false`
+- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+
## Examples
### Basic Export to S3
From f521a95c87278262e89bae8c6ec15218c32ced00 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 16 Jun 2026 09:59:40 -0300
Subject: [PATCH 11/33] improve docs with a warning
---
docs/en/antalya/part_export.md | 2 ++
docs/en/antalya/partition_export.md | 2 ++
src/Core/Settings.cpp | 2 ++
3 files changed, 6 insertions(+)
diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md
index b73efa277e2a..88e1c831a536 100644
--- a/docs/en/antalya/part_export.md
+++ b/docs/en/antalya/part_export.md
@@ -113,6 +113,8 @@ In case a table function is used as the destination, the schema can be omitted a
- **Default**: `false`
- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+ **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+
## Examples
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index 9bd1dfcd16f1..5a5dd2918869 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -117,6 +117,8 @@ Notes:
- **Default**: `false`
- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+ **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+
## Examples
### Basic Export to S3
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 6368ec01c1cb..5fcf971ecc56 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7608,6 +7608,8 @@ Pattern for the filename of the exported merge tree part. The `part_name` and `c
)", 0) \
DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"(
Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+
+Warning: the partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Lossy casts on a partition column can therefore make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the metadata will still contain the original `Int64` values while the data files contain the truncated `Int32` values.
)", 0) \
\
/* ####################################################### */ \
From 41c64eda6f35eeedb14601435f130e0ccad78993 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 16 Jun 2026 21:20:03 -0300
Subject: [PATCH 12/33] hmm... maybe incomplete, gotta continue it tomorrow
---
docs/en/antalya/part_export.md | 4 +-
docs/en/antalya/partition_export.md | 4 +-
src/Core/Settings.cpp | 2 +-
...portReplicatedMergeTreePartitionManifest.h | 7 +++
src/Storages/IStorage.h | 10 ++--
src/Storages/MergeTree/ExportPartTask.cpp | 2 +-
.../MergeTree/ExportPartitionUtils.cpp | 18 +++---
src/Storages/MergeTree/ExportPartitionUtils.h | 7 ++-
.../DataLakes/IDataLakeMetadata.h | 2 +-
.../DataLakes/Iceberg/IcebergMetadata.cpp | 58 ++++++++++++++++++-
.../DataLakes/Iceberg/IcebergMetadata.h | 2 +-
.../ObjectStorage/StorageObjectStorage.cpp | 5 +-
src/Storages/StorageReplicatedMergeTree.cpp | 2 +
13 files changed, 98 insertions(+), 25 deletions(-)
diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md
index 88e1c831a536..73d467c5d9b1 100644
--- a/docs/en/antalya/part_export.md
+++ b/docs/en/antalya/part_export.md
@@ -113,7 +113,9 @@ In case a table function is used as the destination, the schema can be omitted a
- **Default**: `false`
- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
- **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+ When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use. This keeps the Iceberg metadata consistent with the data files.
+
+ **Warning:** A lossy cast on a partition column remains semantically truncating. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into a destination `Int32` partition column, both the data files and the Iceberg metadata will contain the truncated `Int32` value (they agree with each other, but the original `Int64` value is lost). Such casts require `export_merge_tree_part_allow_lossy_cast = 1`.
## Examples
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index 5a5dd2918869..bb17a57f87cd 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -117,7 +117,9 @@ Notes:
- **Default**: `false`
- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
- **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+ When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use. This keeps the Iceberg metadata consistent with the data files.
+
+ **Warning:** A lossy cast on a partition column remains semantically truncating. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into a destination `Int32` partition column, both the data files and the Iceberg metadata will contain the truncated `Int32` value (they agree with each other, but the original `Int64` value is lost). Such casts require `export_merge_tree_part_allow_lossy_cast = 1`.
## Examples
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 5fcf971ecc56..d905075c5b71 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7609,7 +7609,7 @@ Pattern for the filename of the exported merge tree part. The `part_name` and `c
DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"(
Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
-Warning: the partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Lossy casts on a partition column can therefore make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the metadata will still contain the original `Int64` values while the data files contain the truncated `Int32` values.
+When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use, so the metadata stays consistent with the data. A lossy cast on a partition column remains semantically truncating: both the data files and the metadata contain the truncated value, and such casts require this setting to be enabled.
)", 0) \
\
/* ####################################################### */ \
diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
index acfabc28ca61..db26836bdb2e 100644
--- a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
+++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
@@ -174,6 +174,7 @@ struct ExportReplicatedMergeTreePartitionManifest
MergeTreePartExportManifest::FileAlreadyExistsPolicy file_already_exists_policy;
String filename_pattern;
bool write_full_path_in_iceberg_metadata = false;
+ bool allow_lossy_cast = false;
String iceberg_metadata_json;
std::string toJsonString() const
@@ -208,6 +209,7 @@ struct ExportReplicatedMergeTreePartitionManifest
json.set("ttl_seconds", ttl_seconds);
json.set("task_timeout_seconds", task_timeout_seconds);
json.set("write_full_path_in_iceberg_metadata", write_full_path_in_iceberg_metadata);
+ json.set("allow_lossy_cast", allow_lossy_cast);
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
Poco::JSON::Stringifier::stringify(json, oss);
@@ -262,6 +264,11 @@ struct ExportReplicatedMergeTreePartitionManifest
manifest.write_full_path_in_iceberg_metadata = json->getValue("write_full_path_in_iceberg_metadata");
+ /// Default to true for tasks created before this field existed, so an in-flight
+ /// export scheduled with the old permissive worker behavior is not wrongly rejected
+ /// on upgrade. New tasks always persist the initiator's actual choice.
+ manifest.allow_lossy_cast = json->has("allow_lossy_cast") ? json->getValue("allow_lossy_cast") : true;
+
return manifest;
}
};
diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h
index 0a07b892cef8..93caf3f65ebe 100644
--- a/src/Storages/IStorage.h
+++ b/src/Storages/IStorage.h
@@ -504,11 +504,11 @@ It is currently only implemented in StorageObjectStorage.
struct IcebergCommitExportPartitionArguments
{
std::string metadata_json_string;
- /// Partition column values (after transforms). Callers are responsible for
- /// populating this: the partition-export path parses them from the persisted
- /// JSON string, while the direct EXPORT PART path reads them from the part's
- /// partition key.
- std::vector partition_values;
+ /// Representative source partition-key columns from one exported part (the part's
+ /// minmax block). The destination derives the Iceberg partition tuple from a row of
+ /// this block by casting to the destination column types and applying the partition
+ /// transform, so the metadata partition value matches the exported data files.
+ Block partition_source_block;
};
virtual void commitExportPartitionTransaction(
diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp
index f2035577bbf6..e626e7364eaa 100644
--- a/src/Storages/MergeTree/ExportPartTask.cpp
+++ b/src/Storages/MergeTree/ExportPartTask.cpp
@@ -332,7 +332,7 @@ bool ExportPartTask::executeStep()
{
IStorage::IcebergCommitExportPartitionArguments iceberg_args;
iceberg_args.metadata_json_string = manifest.iceberg_metadata_json;
- iceberg_args.partition_values = manifest.data_part->partition.value;
+ iceberg_args.partition_source_block = block_with_partition_values;
destination_storage->commitExportPartitionTransaction(
manifest.transaction_id,
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index c902a4042218..679e07d0b132 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -57,14 +57,13 @@ namespace fs = std::filesystem;
namespace ExportPartitionUtils
{
- std::vector getPartitionValuesForIcebergCommit(
+ Block getPartitionSourceBlockForIcebergCommit(
MergeTreeData & storage, const String & partition_id)
{
auto lock = storage.readLockParts();
const auto parts = storage.getDataPartsVectorInPartitionForInternalUsage(
MergeTreeDataPartState::Active, partition_id, lock);
-
- /// todo arthur: bad arguments for now, pick a better one
+
if (parts.empty())
throw Exception(ErrorCodes::NO_SUCH_DATA_PART,
"Cannot find active part for partition_id '{}' to derive Iceberg partition "
@@ -72,7 +71,7 @@ namespace ExportPartitionUtils
"or this replica has not yet received any part for this partition. "
"The commit will be retried.",
partition_id);
- return parts.front()->partition.value;
+ return parts.front()->minmax_idx->getBlock(storage);
}
ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest)
@@ -102,8 +101,11 @@ namespace ExportPartitionUtils
/// stalls when the setting is only set at the query level.
context_copy->setSetting("allow_insert_into_iceberg", true);
- /// This setting, just like allow_insert_into_iceberg, has been verified at request time. At this point, we allow everything.
- context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", true);
+ /// Reapply the initiator's lossy-cast decision (persisted in the manifest) so the
+ /// worker's schema revalidation honors the user's choice. Without this, a task
+ /// scheduled without the opt-in could still apply a lossy cast if the destination
+ /// schema drifts to a lossy target between scheduling and execution.
+ context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", manifest.allow_lossy_cast);
return context_copy;
}
@@ -217,8 +219,8 @@ namespace ExportPartitionUtils
{
iceberg_args.metadata_json_string = manifest.iceberg_metadata_json;
if (source_storage.getInMemoryMetadataPtr()->hasPartitionKey())
- iceberg_args.partition_values =
- getPartitionValuesForIcebergCommit(source_storage, manifest.partition_id);
+ iceberg_args.partition_source_block =
+ getPartitionSourceBlockForIcebergCommit(source_storage, manifest.partition_id);
}
destination_storage->commitExportPartitionTransaction(manifest.transaction_id, manifest.partition_id, exported_paths, iceberg_args, context);
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h
index d97bb538a47f..0434bc59a2cb 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.h
+++ b/src/Storages/MergeTree/ExportPartitionUtils.h
@@ -27,14 +27,15 @@ namespace ExportPartitionUtils
ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest);
- /// Returns the partition key values for the given partition_id by reading from
- /// the first active local part. Throws LOGICAL_ERROR if no such part is found.
+ /// Returns the representative source partition-key columns (the first active local part's
+ /// minmax block) for the given partition_id. The destination recomputes the Iceberg partition
+ /// tuple from this block by casting to its column types and applying the partition transform.
///
/// Edge case: if the partition was dropped after export started, or this replica
/// has not yet received any part for this partition (extreme replication lag on a
/// recovery path), no active part will be found and the commit will fail. The task
/// will be retried on the next poll cycle or picked up by a different replica.
- std::vector getPartitionValuesForIcebergCommit(
+ Block getPartitionSourceBlockForIcebergCommit(
MergeTreeData & storage, const String & partition_id);
void commit(
diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
index 85c91ba5de19..a90c49b35210 100644
--- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
@@ -226,7 +226,7 @@ class IDataLakeMetadata : boost::noncopyable
const String & /* transaction_id */,
Int64 /* original_schema_id */,
Int64 /* partition_spec_id */,
- const std::vector & /* partition_values */,
+ const Block & /* partition_source_block */,
SharedHeader /* sample_block */,
const std::vector & /* data_file_paths */,
ContextPtr /* context */)
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
index 6016be1cd341..9819a8783516 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
@@ -30,6 +30,7 @@
#include
#include
+#include
#include
#include
@@ -1524,6 +1525,57 @@ Poco::JSON::Object::Ptr lookupSchema(const Poco::JSON::Object::Ptr & meta, Int64
"Schema with id {} not found in table metadata", schema_id);
}
+/// Derive the Iceberg partition tuple for an exported part from a representative source row.
+/// The MergeTree `partition.value` is the source partition-key expression result; it is neither
+/// cast to the destination column types nor expressed through the Iceberg transform, so it must
+/// not be written to metadata directly. Within a MergeTree partition the transform result is
+/// constant, so a single representative value per partition-source column (taken from the part's
+/// minmax block) suffices: cast it to the destination column type and run the same transform the
+/// data uses. The result is transform-correct and consistent with the exported data files.
+std::vector recomputeExportPartitionValues(
+ ChunkPartitioner & partitioner,
+ const SharedHeader & sample_block,
+ const Block & partition_source_block)
+{
+ const auto & partition_columns = partitioner.getColumns();
+ if (partition_columns.empty())
+ return {};
+
+ for (const auto & column_name : partition_columns)
+ if (!partition_source_block.has(column_name))
+ throw Exception(ErrorCodes::LOGICAL_ERROR,
+ "Partition source column '{}' required by the Iceberg partition transform is missing "
+ "from the representative source block while committing an export.", column_name);
+
+ Columns columns;
+ columns.reserve(sample_block->columns());
+ for (size_t i = 0; i < sample_block->columns(); ++i)
+ {
+ const auto & dest_column = sample_block->getByPosition(i);
+ if (partition_source_block.has(dest_column.name))
+ {
+ const auto & source = partition_source_block.getByName(dest_column.name);
+ ColumnWithTypeAndName representative{source.column->cut(0, 1), source.type, source.name};
+ columns.push_back(castColumn(representative, dest_column.type));
+ }
+ else
+ {
+ auto column = dest_column.type->createColumn();
+ column->insertDefault();
+ columns.push_back(std::move(column));
+ }
+ }
+
+ auto partitioned = partitioner.partitionChunk(Chunk(std::move(columns), 1));
+ if (partitioned.size() != 1)
+ throw Exception(ErrorCodes::LOGICAL_ERROR,
+ "Recomputing Iceberg partition values produced {} partitions for a single representative row; "
+ "a MergeTree partition must map to exactly one Iceberg partition.", partitioned.size());
+
+ const auto & key = partitioned.front().first;
+ return std::vector(key.begin(), key.end());
+}
+
}
bool IcebergMetadata::commitImportPartitionTransactionImpl(
@@ -1826,7 +1878,7 @@ void IcebergMetadata::commitExportPartitionTransaction(
const String & transaction_id,
Int64 original_schema_id,
Int64 partition_spec_id,
- const std::vector & partition_values,
+ const Block & partition_source_block,
SharedHeader sample_block,
const std::vector & data_file_paths,
ContextPtr context)
@@ -1895,6 +1947,10 @@ void IcebergMetadata::commitExportPartitionTransaction(
const auto partition_columns = partitioner.getColumns();
const auto partition_types = partitioner.getResultTypes();
+ /// Recompute the partition tuple via the destination transform so the metadata partition
+ /// value matches the exported data (rather than the raw source MergeTree partition value).
+ const auto partition_values = recomputeExportPartitionValues(partitioner, sample_block, partition_source_block);
+
const auto metadata_compression_method = persistent_components.metadata_compression_method;
FileNamesGenerator filename_generator = FileNamesGenerator(
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
index 2e28df93e0e6..2ec8efe4a9b7 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
@@ -141,7 +141,7 @@ class IcebergMetadata : public IDataLakeMetadata
const String & transaction_id,
Int64 original_schema_id,
Int64 partition_spec_id,
- const std::vector & partition_values,
+ const Block & partition_source_block,
SharedHeader sample_block,
const std::vector & data_file_paths,
ContextPtr context) override;
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
index c08db678c46c..50f8c373767d 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
@@ -680,7 +680,8 @@ void StorageObjectStorage::commitExportPartitionTransaction(
/// Parse the Iceberg metadata snapshot (stored in ZooKeeper at export-start time) only to
/// extract the schema-id and partition-spec-id that were current when the export began.
/// partition_columns and partition_types are derived inside commitExportPartitionTransaction
- /// from the same JSON, so only partition_values need to be carried here.
+ /// from the same JSON; the representative source partition columns are carried here so the
+ /// partition tuple can be recomputed through the destination transform.
Poco::JSON::Parser iceberg_parser;
Poco::JSON::Object::Ptr iceberg_metadata =
iceberg_parser.parse(iceberg_commit_export_partition_arguments.metadata_json_string).extract();
@@ -695,7 +696,7 @@ void StorageObjectStorage::commitExportPartitionTransaction(
transaction_id,
original_schema_id,
partition_spec_id,
- iceberg_commit_export_partition_arguments.partition_values,
+ iceberg_commit_export_partition_arguments.partition_source_block,
std::make_shared(getInMemoryMetadataPtr()->getSampleBlock()),
exported_paths,
local_context);
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 2301ec1c1a30..10eee4e5db95 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -225,6 +225,7 @@ namespace Setting
extern const SettingsUInt64 export_merge_tree_part_max_rows_per_file;
extern const SettingsBool export_merge_tree_part_throw_on_pending_mutations;
extern const SettingsBool export_merge_tree_part_throw_on_pending_patch_parts;
+ extern const SettingsBool export_merge_tree_part_allow_lossy_cast;
extern const SettingsExportPartitionAllOnError export_merge_tree_partition_all_on_error;
extern const SettingsString export_merge_tree_part_filename_pattern;
extern const SettingsBool write_full_path_in_iceberg_metadata;
@@ -8537,6 +8538,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
manifest.file_already_exists_policy = query_context->getSettingsRef()[Setting::export_merge_tree_part_file_already_exists_policy].value;
manifest.filename_pattern = query_context->getSettingsRef()[Setting::export_merge_tree_part_filename_pattern].value;
manifest.write_full_path_in_iceberg_metadata = query_context->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata];
+ manifest.allow_lossy_cast = query_context->getSettingsRef()[Setting::export_merge_tree_part_allow_lossy_cast];
if (dest_storage->isDataLake())
{
From 3b177ed647fe9c74fabe99930d7b8e89c9c34985 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Wed, 17 Jun 2026 09:51:12 -0300
Subject: [PATCH 13/33] add tests covering corner tricky case
---
.../test.py | 246 ++++++++++++++++++
1 file changed, 246 insertions(+)
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 09cf1c2ff467..00cf5b5ff6f2 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -1396,3 +1396,249 @@ def test_export_partition_all_iceberg_types_lossy(cluster):
f"SELECT abs(f - 2.718281828459045) < 1e-6, abs(f - 2.718281828459045) > 1e-9 FROM {iceberg_table}"
).strip()
assert f_checks == "1\t1", f"Expected Float32 precision loss within tolerance, got: {f_checks!r}"
+
+
+def _data_file_partition_records(entries):
+ """Partition dicts of the non-delete data files described by manifest entries."""
+ records = []
+ for entry in entries:
+ data_file = entry.get("data_file") or {}
+ if data_file.get("content", 0) not in (0, None):
+ continue
+ partition = data_file.get("partition")
+ if partition is not None:
+ records.append(partition)
+ return records
+
+
+def _partition_scalar(partition, field):
+ """Read a partition field value, tolerating an Avro-union ``{type: value}`` wrapper."""
+ value = partition.get(field)
+ if isinstance(value, dict):
+ assert len(value) == 1, f"Unexpected partition union shape for {field!r}: {value!r}"
+ value = next(iter(value.values()))
+ return value
+
+
+def test_export_partition_bucket_transform_metadata_matches_data(cluster):
+ """A bucket[N] partition column whose type changes Int64 -> String records the
+ destination murmur(String) bucket in the Iceberg metadata, matching the exported
+ data rather than the source hashLong bucket."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_bucket_xform_{uid}"
+ iceberg_table = f"iceberg_bucket_xform_{uid}"
+
+ # N=16, key=42 diverges: icebergBucket(16, 42::Int64)=14 (source/old hashLong) but
+ # icebergBucket(16, '42')=6 (destination/new murmur over the exported String).
+ make_rmt(node, mt_table, "id Int64, key Int64", "icebergBucket(16, key)",
+ replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES (1, 42), (2, 42)")
+
+ make_iceberg_s3(node, iceberg_table, "id Int64, key String",
+ partition_by="icebergBucket(16, key)")
+
+ pid = first_partition_id(node, mt_table)
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 2, f"Expected 2 rows after export, got {count}"
+
+ string_bucket = int(node.query(
+ f"SELECT DISTINCT icebergBucket(16, key) FROM {iceberg_table}"
+ ).strip())
+ long_bucket = int(node.query(
+ f"SELECT DISTINCT icebergBucket(16, toInt64(key)) FROM {iceberg_table}"
+ ).strip())
+ assert string_bucket != long_bucket, (
+ f"Test setup invalid: String and Int64 buckets coincide ({string_bucket}); "
+ f"pick a different N/key so the transform diverges."
+ )
+
+ query_id = f"bucket_xform_{uid}"
+ node.query(
+ f"SELECT * FROM {iceberg_table}",
+ query_id=query_id,
+ settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+ )
+ entries = fetch_manifest_entries(node, query_id)
+ partitions = _data_file_partition_records(entries)
+ assert partitions, "No data-file partition records found in manifest entries"
+ meta_values = {int(_partition_scalar(p, "key")) for p in partitions}
+ assert meta_values == {string_bucket}, (
+ f"Metadata bucket {meta_values} must equal the destination String bucket "
+ f"{string_bucket} (not the source Int64 bucket {long_bucket})."
+ )
+
+
+def test_export_partition_month_transform_metadata_matches_data(cluster):
+ """A month-transform partition records a months-since-epoch value in metadata that
+ matches the value derived from the exported data, and a transform-filtered read
+ returns the rows."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_month_xform_{uid}"
+ iceberg_table = f"iceberg_month_xform_{uid}"
+
+ make_rmt(node, mt_table, "id Int64, event_date Date",
+ "toMonthNumSinceEpoch(event_date)", replica_name="replica1")
+ node.query(
+ f"INSERT INTO {mt_table} VALUES "
+ f"(1, '2024-03-05'), (2, '2024-03-20'), (3, '2024-03-31')"
+ )
+
+ make_iceberg_s3(node, iceberg_table, "id Int64, event_date Date",
+ partition_by="toMonthNumSinceEpoch(event_date)")
+
+ pid = first_partition_id(node, mt_table)
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 3, f"Expected 3 rows after export, got {count}"
+
+ month_num = int(node.query(
+ f"SELECT DISTINCT toMonthNumSinceEpoch(event_date) FROM {iceberg_table}"
+ ).strip())
+
+ query_id = f"month_xform_{uid}"
+ node.query(
+ f"SELECT * FROM {iceberg_table}",
+ query_id=query_id,
+ settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+ )
+ entries = fetch_manifest_entries(node, query_id)
+ partitions = _data_file_partition_records(entries)
+ assert partitions, "No data-file partition records found in manifest entries"
+ meta_values = {int(_partition_scalar(p, "event_date")) for p in partitions}
+ assert meta_values == {month_num}, (
+ f"Metadata month {meta_values} must equal toMonthNumSinceEpoch over the data "
+ f"({month_num})."
+ )
+
+ filtered = int(node.query(
+ f"SELECT count() FROM {iceberg_table} "
+ f"WHERE toMonthNumSinceEpoch(event_date) = {month_num}"
+ ).strip())
+ assert filtered == 3, f"Transform-filtered read expected 3 rows, got {filtered}"
+
+
+def test_export_partition_identity_type_change_metadata_matches_data(cluster):
+ """An identity partition column whose type changes UInt16 -> String records the
+ destination String value in the Iceberg metadata, matching the exported data."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_identity_xform_{uid}"
+ iceberg_table = f"iceberg_identity_xform_{uid}"
+
+ make_rmt(node, mt_table, "id Int32, year UInt16", "year", replica_name="replica1")
+ node.query(f"INSERT INTO {mt_table} VALUES (1, 2024), (2, 2024)")
+
+ make_iceberg_s3(node, iceberg_table, "id Int32, year String", partition_by="year")
+
+ pid = first_partition_id(node, mt_table)
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 2, f"Expected 2 rows after export, got {count}"
+
+ data_year = node.query(f"SELECT DISTINCT year FROM {iceberg_table}").strip()
+ assert data_year == "2024", f"Expected exported year '2024' (String), got {data_year!r}"
+
+ query_id = f"identity_xform_{uid}"
+ node.query(
+ f"SELECT * FROM {iceberg_table}",
+ query_id=query_id,
+ settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+ )
+ entries = fetch_manifest_entries(node, query_id)
+ partitions = _data_file_partition_records(entries)
+ assert partitions, "No data-file partition records found in manifest entries"
+ meta_values = {str(_partition_scalar(p, "year")) for p in partitions}
+ assert meta_values == {"2024"}, (
+ f"Metadata partition {meta_values} must equal the destination String value "
+ f"'2024' (not the source integer representation)."
+ )
+
+
+def test_export_partition_multicolumn_identity_metadata_matches_data(cluster):
+ """A multi-column identity partition (event_date Date, retention UInt64 -> Int64)
+ records per-column values in the Iceberg metadata that match the exported data."""
+ node = cluster.instances["replica1"]
+
+ uid = unique_suffix()
+ mt_table = f"mt_multicol_{uid}"
+ iceberg_table = f"iceberg_multicol_{uid}"
+
+ # Iceberg has no unsigned types, so retention widens UInt64 -> Int64; the cast is
+ # not value-preserving per canBeSafelyCast, hence the lossy opt-in below.
+ make_rmt(node, mt_table, "id Int64, event_date Date, retention UInt64",
+ "(event_date, retention)", replica_name="replica1")
+ node.query(
+ f"INSERT INTO {mt_table} VALUES "
+ f"(1, '2024-03-05', 30), (2, '2024-03-05', 30), (3, '2024-03-05', 30)"
+ )
+
+ make_iceberg_s3(node, iceberg_table, "id Int64, event_date Date, retention Int64",
+ partition_by="(event_date, retention)")
+
+ pid = first_partition_id(node, mt_table)
+ node.query(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+ settings={
+ "allow_insert_into_iceberg": 1,
+ "export_merge_tree_part_allow_lossy_cast": 1,
+ },
+ )
+ wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+ count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+ assert count == 3, f"Expected 3 rows after export, got {count}"
+
+ data_retention = int(node.query(
+ f"SELECT DISTINCT retention FROM {iceberg_table}"
+ ).strip())
+ assert data_retention == 30, f"Expected exported retention 30, got {data_retention}"
+
+ days = int(node.query(
+ f"SELECT DISTINCT toInt64(event_date) FROM {iceberg_table}"
+ ).strip())
+
+ query_id = f"multicol_{uid}"
+ node.query(
+ f"SELECT * FROM {iceberg_table}",
+ query_id=query_id,
+ settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+ )
+ entries = fetch_manifest_entries(node, query_id)
+ partitions = _data_file_partition_records(entries)
+ assert partitions, "No data-file partition records found in manifest entries"
+
+ meta_dates = {int(_partition_scalar(p, "event_date")) for p in partitions}
+ assert meta_dates == {days}, (
+ f"Metadata event_date {meta_dates} must equal days-since-epoch {days}."
+ )
+ meta_retentions = {int(_partition_scalar(p, "retention")) for p in partitions}
+ assert meta_retentions == {30}, (
+ f"Metadata retention {meta_retentions} must equal the exported value 30."
+ )
+
+ filtered = int(node.query(
+ f"SELECT count() FROM {iceberg_table} "
+ f"WHERE event_date = '2024-03-05' AND retention = 30"
+ ).strip())
+ assert filtered == 3, f"Partition-filtered read expected 3 rows, got {filtered}"
From 4a75d529c95b34897183bd4d110930ee97e73763 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 16 Jun 2026 17:09:41 -0300
Subject: [PATCH 14/33] rmv task ttl
---
docs/en/antalya/partition_export.md | 9 +----
src/Core/Settings.cpp | 6 +--
...portReplicatedMergeTreePartitionManifest.h | 3 --
.../ExportPartitionManifestUpdatingTask.cpp | 37 +++++-------------
src/Storages/StorageReplicatedMergeTree.cpp | 33 ++--------------
.../test.py | 35 +++++++++--------
.../test.py | 39 ++++++++++---------
7 files changed, 53 insertions(+), 109 deletions(-)
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index bb17a57f87cd..95a2cfe36fb4 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -59,7 +59,7 @@ TO TABLE [destination_database.]destination_table
- **Type**: `Bool`
- **Default**: `false`
-- **Description**: Ignore existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition to the same destination before the manifest expires. **IMPORTANT:** this is dangerous because it can lead to duplicated data, use it with caution.
+- **Description**: Ignore existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition that was already exported to the same destination. **IMPORTANT:** this is dangerous because it can lead to duplicated data, use it with caution.
#### `export_merge_tree_partition_max_retries` (Optional)
@@ -67,12 +67,6 @@ TO TABLE [destination_database.]destination_table
- **Default**: `3`
- **Description**: Maximum number of retries for exporting a merge tree part in an export partition task. If it exceeds, the entire task fails.
-#### `export_merge_tree_partition_manifest_ttl` (Optional)
-
-- **Type**: `UInt64`
-- **Default**: `180` (seconds)
-- **Description**: Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination. This setting does not affect or delete in-progress tasks; it only cleans up completed ones.
-
#### `export_merge_tree_part_file_already_exists_policy` (Optional)
- **Type**: `MergeTreePartExportFileAlreadyExistsPolicy`
@@ -109,7 +103,6 @@ When the timeout is exceeded the task transitions to KILLED (same terminal state
Notes:
- Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation.
-- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires.
### `export_merge_tree_part_allow_lossy_cast` (Optional)
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index d905075c5b71..7a9089880d96 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7557,10 +7557,6 @@ Ignore existing partition export and overwrite the zookeeper entry
)", 0) \
DECLARE(UInt64, export_merge_tree_partition_max_retries, 3, R"(
Maximum number of retries for exporting a merge tree part in an export partition task
-)", 0) \
- DECLARE(UInt64, export_merge_tree_partition_manifest_ttl, 86400, R"(
-Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination.
-This setting does not affect / delete in progress tasks. It'll only cleanup the completed ones.
)", 0) \
DECLARE(UInt64, export_merge_tree_partition_task_timeout_seconds, 86400, R"(
Maximum wall-clock duration (in seconds) an export partition task is allowed to remain in the PENDING state before it is auto-killed by the background cleanup loop.
@@ -7573,7 +7569,6 @@ In such scenario, ClickHouse would attempt to commit those files again producing
Notes:
- Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation.
-- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires.
)", 0) \
DECLARE(MergeTreePartExportFileAlreadyExistsPolicy, export_merge_tree_part_file_already_exists_policy, MergeTreePartExportFileAlreadyExistsPolicy::skip, R"(
Possible values:
@@ -7948,6 +7943,7 @@ Maximum number of WebAssembly UDF instances that can run in parallel per functio
#define OBSOLETE_SETTINGS(M, ALIAS) \
/** Obsolete settings which are kept around for compatibility reasons. They have no effect anymore. */ \
+ MAKE_OBSOLETE(M, UInt64, export_merge_tree_partition_manifest_ttl, 86400) \
MAKE_OBSOLETE(M, Bool, query_condition_cache_store_conditions_as_plaintext, false) \
MAKE_OBSOLETE(M, Bool, update_insert_deduplication_token_in_dependent_materialized_views, 0) \
MAKE_OBSOLETE(M, UInt64, max_memory_usage_for_all_queries, 0) \
diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
index db26836bdb2e..4e28e5e4f505 100644
--- a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
+++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
@@ -164,7 +164,6 @@ struct ExportReplicatedMergeTreePartitionManifest
std::vector parts;
time_t create_time;
size_t max_retries;
- size_t ttl_seconds;
size_t task_timeout_seconds;
size_t max_threads;
bool parallel_formatting;
@@ -206,7 +205,6 @@ struct ExportReplicatedMergeTreePartitionManifest
json.set("filename_pattern", filename_pattern);
json.set("create_time", create_time);
json.set("max_retries", max_retries);
- json.set("ttl_seconds", ttl_seconds);
json.set("task_timeout_seconds", task_timeout_seconds);
json.set("write_full_path_in_iceberg_metadata", write_full_path_in_iceberg_metadata);
json.set("allow_lossy_cast", allow_lossy_cast);
@@ -242,7 +240,6 @@ struct ExportReplicatedMergeTreePartitionManifest
manifest.parts.push_back(parts_array->getElement(static_cast(i)));
manifest.create_time = json->getValue("create_time");
- manifest.ttl_seconds = json->getValue("ttl_seconds");
manifest.task_timeout_seconds = json->getValue("task_timeout_seconds");
manifest.max_threads = json->getValue("max_threads");
manifest.parallel_formatting = json->getValue("parallel_formatting");
diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
index b45f3667be87..6926ee77953a 100644
--- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
+++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
@@ -128,20 +128,19 @@ namespace
}
/*
- Remove expired entries and fix non-committed exports that have already exported all parts.
+ Enforce the PENDING task timeout and recover non-committed exports that have already
+ exported all parts. Entries are never removed for age — `system.replicated_partition_exports`
+ is append-only history.
- Return values:
- - true: the cleanup was successful, the entry is removed from the entries_by_key container and the function returns true. Proceed to the next entry.
- - false: the cleanup was not successful, the entry is not removed from the entries_by_key container and the function returns false.
+ Always returns false: the entry stays in the in-memory container. A KILLED transition is
+ driven by the status watch, and a deferred commit is handled by the caller after the lock
+ is released.
Side outputs:
- `deferred_commits`: when a PENDING entry has all parts processed but the export was
never committed, this function appends a CommitRecoveryWork item to be executed by
the caller after releasing the storage-wide mutex. The actual commit() call (which
performs network I/O to the destination catalog and S3) MUST NOT run under the lock.
- The function still returns `false` in that case so the outer poll() loop falls through
- to `addTask`, keeping the in-memory entry consistent regardless of whether the
- deferred commit ultimately succeeds.
*/
bool tryCleanup(
const zkutil::ZooKeeperPtr & zk,
@@ -149,33 +148,17 @@ namespace
const LoggerPtr & log,
const ContextPtr & storage_context,
StorageReplicatedMergeTree & storage,
- const std::string & key,
const ExportReplicatedMergeTreePartitionManifest & metadata,
const time_t now,
const bool is_pending,
- auto & entries_by_key,
std::vector & deferred_commits
)
{
- bool has_expired = metadata.create_time < now - static_cast(metadata.ttl_seconds);
-
bool task_timed_out = is_pending
&& metadata.task_timeout_seconds > 0
&& metadata.create_time + static_cast(metadata.task_timeout_seconds) < now;
- if (has_expired && !is_pending)
- {
- zk->tryRemoveRecursive(fs::path(entry_path));
- ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
- ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRemoveRecursive);
- auto it = entries_by_key.find(key);
- if (it != entries_by_key.end())
- entries_by_key.erase(it);
- LOG_INFO(log, "ExportPartition Manifest Updating Task: Removed {}: expired", key);
-
- return true;
- }
- else if (task_timed_out)
+ if (task_timed_out)
{
const std::string status_path = fs::path(entry_path) / "status";
@@ -365,8 +348,8 @@ void ExportPartitionManifestUpdatingTask::poll()
const std::string cleanup_lock_path = fs::path(storage.zookeeper_path) / "exports_cleanup_lock";
/// The `exports_cleanup_lock` is an ephemeral ZK node that serializes cleanup work
- /// across replicas: only the replica holding it walks `tryCleanup` (entry expiry +
- /// commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer
+ /// across replicas: only the replica holding it walks `tryCleanup` (task-timeout
+ /// enforcement + commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer
/// replica's next poll() could acquire it and race us on the same commit-recovery work,
/// duplicating REST-catalog round-trips and snapshot writes. The EphemeralNodeHolder
/// destructor removes the node, so we declare it at function scope and let it die
@@ -478,11 +461,9 @@ void ExportPartitionManifestUpdatingTask::poll()
storage.log.load(),
storage.getContext(),
storage,
- key,
metadata,
now,
*status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING,
- entries_by_key,
deferred_commits);
if (cleanup_successful)
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 10eee4e5db95..046399f0bdfe 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -215,7 +215,6 @@ namespace Setting
extern const SettingsBool allow_experimental_export_merge_tree_part;
extern const SettingsBool export_merge_tree_partition_force_export;
extern const SettingsUInt64 export_merge_tree_partition_max_retries;
- extern const SettingsUInt64 export_merge_tree_partition_manifest_ttl;
extern const SettingsUInt64 export_merge_tree_partition_task_timeout_seconds;
extern const SettingsBool output_format_parallel_formatting;
extern const SettingsBool output_format_parquet_parallel_encoding;
@@ -8416,36 +8415,11 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperExists);
if (zookeeper->exists(partition_exports_path))
{
- LOG_INFO(log, "Export with key {} is already exported or it is being exported. Checking if it has expired so that we can overwrite it", export_key);
+ LOG_INFO(log, "Export with key {} is already exported or it is being exported", export_key);
- bool has_expired = false;
-
- ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
- ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperExists);
- if (zookeeper->exists(fs::path(partition_exports_path) / "metadata.json"))
- {
- std::string metadata_json;
- ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
- ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet);
- if (zookeeper->tryGet(fs::path(partition_exports_path) / "metadata.json", metadata_json))
- {
- const auto manifest = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json);
-
- const auto now = time(nullptr);
- const auto expiration_time = manifest.create_time + manifest.ttl_seconds;
-
- LOG_INFO(log, "Export with key {} has expiration time {}, now is {}", export_key, expiration_time, now);
-
- if (static_cast(expiration_time) < now)
- {
- has_expired = true;
- }
- }
- }
-
- if (!has_expired && !query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export])
+ if (!query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export])
{
- throw Exception(ErrorCodes::EXPORT_PARTITION_ALREADY_EXPORTED, "Export with key {} already exported or it is being exported, and it has not expired. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key);
+ throw Exception(ErrorCodes::EXPORT_PARTITION_ALREADY_EXPORTED, "Export with key {} already exported or it is being exported. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key);
}
LOG_INFO(log, "Overwriting export with key {}", export_key);
@@ -8527,7 +8501,6 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
manifest.parts = part_names;
manifest.create_time = time(nullptr);
manifest.max_retries = query_context->getSettingsRef()[Setting::export_merge_tree_partition_max_retries];
- manifest.ttl_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_manifest_ttl];
manifest.task_timeout_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_task_timeout_seconds];
manifest.max_threads = query_context->getSettingsRef()[Setting::max_threads];
manifest.parallel_formatting = query_context->getSettingsRef()[Setting::output_format_parallel_formatting];
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 00cf5b5ff6f2..89ceeca47003 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,13 +746,12 @@ def test_partition_key_compatibility_check(cluster):
)
-def test_export_ttl(cluster):
+def test_export_entries_never_expire(cluster):
"""
- After a manifest TTL expires the same partition can be re-exported, and the
- new data is appended to (or replaces) what is in the Iceberg table.
+ Export entries never expire: a second export of the same partition is rejected
+ indefinitely unless export_merge_tree_partition_force_export is set.
"""
node = cluster.instances["replica1"]
- ttl_seconds = 3
uid = unique_suffix()
mt_table = f"mt_{uid}"
@@ -763,28 +762,31 @@ def test_export_ttl(cluster):
# First export.
node.query(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
- f"SETTINGS export_merge_tree_partition_manifest_ttl = {ttl_seconds}, allow_insert_into_iceberg = 1"
+ f"SETTINGS allow_insert_into_iceberg = 1"
)
+ wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
+
+ count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip())
+ assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}"
- # A second export before the TTL expires must be rejected.
+ # A second export is rejected, and stays rejected (the entry never expires).
error = node.query_and_get_error(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
settings={"allow_insert_into_iceberg": 1},
)
- assert "Export with key" in error, f"Expected duplicate-export error before TTL, got: {error}"
-
- wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
- count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip())
- assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}"
+ assert "Export with key" in error, f"Expected duplicate-export error, got: {error}"
- # Wait for the manifest TTL to expire.
- time.sleep(ttl_seconds * 2)
+ time.sleep(5)
+ error = node.query_and_get_error(
+ f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+ settings={"allow_insert_into_iceberg": 1},
+ )
+ assert "Export with key" in error, f"Entry must not expire, got: {error}"
- # Second export must be accepted now.
+ # Only force_export can overwrite the existing entry.
node.query(
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
- settings={"allow_insert_into_iceberg": 1},
+ settings={"allow_insert_into_iceberg": 1, "export_merge_tree_partition_force_export": 1},
)
wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
@@ -888,7 +890,6 @@ def test_export_task_timeout_kills_stuck_pending_task(cluster):
f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}"
f" SETTINGS export_merge_tree_partition_task_timeout_seconds = 5,"
f" export_merge_tree_partition_max_retries = 1000000,"
- f" export_merge_tree_partition_manifest_ttl = 3600,"
f" allow_insert_into_iceberg = 1"
)
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index d403538999df..c4e38cbff276 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -627,40 +627,43 @@ def test_inject_short_living_failures(cluster):
assert int(exception_count.strip()) >= 1, "Expected at least one exception"
-def test_export_ttl(cluster):
+def test_export_entries_never_expire(cluster):
+ """Export entries are kept as history; a duplicate export is rejected unless forced."""
node = cluster.instances["replica1"]
postfix = str(uuid.uuid4()).replace("-", "_")
mt_table = f"export_ttl_mt_table_{postfix}"
s3_table = f"export_ttl_s3_table_{postfix}"
- expiration_time = 3
-
create_tables_and_insert_data(node, mt_table, s3_table, "replica1")
# start export
- node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_manifest_ttl={expiration_time};")
-
- # assert that I get an error when trying to export the same partition again, query_and_get_error
- error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
- assert "Export with key" in error, "Expected error about expired export"
-
- # wait for the export to finish and for the manifest to expire
+ node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
- time.sleep(expiration_time * 2)
- # assert that the export succeeded, check the commit file
assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed"
- # start export again
- node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}")
+ # the entry never expires, so re-exporting the same partition is always rejected
+ error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
+ assert "Export with key" in error, "Expected duplicate-export error"
- # wait for the export to finish
+ # the completed entry is still present in the history table after a while
+ time.sleep(5)
+ assert node.query(
+ f"""
+ SELECT status FROM system.replicated_partition_exports
+ WHERE source_table = '{mt_table}'
+ AND destination_table = '{s3_table}'
+ AND partition_id = '2020'
+ """
+ ) == "COMPLETED\n", "Export entry should remain in history and not expire"
+
+ # only force_export can overwrite the existing entry
+ node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export = 1")
wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
- # assert that the export succeeded, check the commit file
- # there should be two commit files now, one for the first export and one for the second export
- assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Export did not succeed"
+ # there should be two commit files now, one for each export
+ assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Forced re-export did not succeed"
def test_export_partition_file_already_exists_policy(cluster):
From a676814106777b62d94c002e8754582da3ad4e71 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 16 Jun 2026 17:18:32 -0300
Subject: [PATCH 15/33] cleanup
---
.../ExportPartitionManifestUpdatingTask.cpp | 43 +++++++-----------
.../test.py | 45 -------------------
.../test.py | 39 ----------------
3 files changed, 17 insertions(+), 110 deletions(-)
diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
index 6926ee77953a..8ba8d8fc64a0 100644
--- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
+++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
@@ -130,11 +130,9 @@ namespace
/*
Enforce the PENDING task timeout and recover non-committed exports that have already
exported all parts. Entries are never removed for age — `system.replicated_partition_exports`
- is append-only history.
-
- Always returns false: the entry stays in the in-memory container. A KILLED transition is
- driven by the status watch, and a deferred commit is handled by the caller after the lock
- is released.
+ is append-only history, so the entry always stays in the in-memory container: a KILLED
+ transition is driven by the status watch, and a deferred commit is handled by the caller
+ after the lock is released.
Side outputs:
- `deferred_commits`: when a PENDING entry has all parts processed but the export was
@@ -142,7 +140,7 @@ namespace
the caller after releasing the storage-wide mutex. The actual commit() call (which
performs network I/O to the destination catalog and S3) MUST NOT run under the lock.
*/
- bool tryCleanup(
+ void tryCleanup(
const zkutil::ZooKeeperPtr & zk,
const std::string & entry_path,
const LoggerPtr & log,
@@ -170,14 +168,14 @@ namespace
if (!zk->tryGet(status_path, status_string, &status_stat))
{
LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to read status for {} while enforcing task timeout, skipping", entry_path);
- return false;
+ return;
}
const auto current_status = magic_enum::enum_cast(status_string);
if (!current_status || *current_status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING)
{
LOG_INFO(log, "ExportPartition Manifest Updating Task: Task {} is not PENDING, can't set to KILLED, skipping", entry_path);
- return false;
+ return;
}
const auto timeout_message = fmt::format(
@@ -217,9 +215,9 @@ namespace
entry_path, rc);
}
- /// Return false so the entry remains in entries_by_key; the status watch will drive
+ /// The entry remains in entries_by_key; the status watch will drive
/// handleStatusChanges -> killExportPart on every replica, mirroring user-initiated KILL.
- return false;
+ return;
}
else if (is_pending)
{
@@ -232,7 +230,7 @@ namespace
{
LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to get parts in processing or pending, skipping");
- return false;
+ return;
}
if (parts_in_processing_or_pending.empty())
@@ -244,7 +242,7 @@ namespace
if (!destination_storage)
{
LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs());
- return false;
+ return;
}
/// A replica exported the last part but the commit never landed. Capture everything
@@ -253,22 +251,18 @@ namespace
/// MAX_TRANSACTION_RETRIES = 100 retries; holding the storage-wide mutex across
/// that work is what caused `system.replicated_partition_exports` to hang.
///
- /// Returning false here keeps the outer poll() loop on the normal path: it will
- /// call addTask() so the in-memory container reflects the PENDING entry. The
- /// status watch registered by poll() will transition the local entry to
- /// COMPLETED/FAILED once the deferred commit (or a peer's commit) updates
- /// /status in ZooKeeper.
+ /// The outer poll() loop stays on the normal path: it will call addTask() so the
+ /// in-memory container reflects the PENDING entry. The status watch registered by
+ /// poll() will transition the local entry to COMPLETED/FAILED once the deferred
+ /// commit (or a peer's commit) updates /status in ZooKeeper.
deferred_commits.push_back(CommitRecoveryWork{
.metadata = metadata,
.entry_path = entry_path,
.destination_storage = destination_storage,
.context = context,
});
- return false;
}
}
-
- return false;
}
}
@@ -451,11 +445,11 @@ void ExportPartitionManifestUpdatingTask::poll()
continue;
}
- /// if we have the cleanup lock, try to cleanup
- /// if we successfully cleaned it up, early exit
+ /// If we hold the cleanup lock, enforce the task timeout and recover uncommitted exports.
+ /// Entries are never removed here, so we always fall through to refresh / addTask below.
if (cleanup_lock)
{
- bool cleanup_successful = tryCleanup(
+ tryCleanup(
zk,
entry_path,
storage.log.load(),
@@ -465,9 +459,6 @@ void ExportPartitionManifestUpdatingTask::poll()
now,
*status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING,
deferred_commits);
-
- if (cleanup_successful)
- continue;
}
if (has_local_entry_and_is_up_to_date)
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 89ceeca47003..62a8a2196313 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,51 +746,6 @@ def test_partition_key_compatibility_check(cluster):
)
-def test_export_entries_never_expire(cluster):
- """
- Export entries never expire: a second export of the same partition is rejected
- indefinitely unless export_merge_tree_partition_force_export is set.
- """
- node = cluster.instances["replica1"]
-
- uid = unique_suffix()
- mt_table = f"mt_{uid}"
- iceberg_table = f"iceberg_{uid}"
-
- setup_tables(cluster, mt_table, iceberg_table, nodes=["replica1"])
-
- # First export.
- node.query(
- f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
- f"SETTINGS allow_insert_into_iceberg = 1"
- )
- wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
- count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip())
- assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}"
-
- # A second export is rejected, and stays rejected (the entry never expires).
- error = node.query_and_get_error(
- f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
- settings={"allow_insert_into_iceberg": 1},
- )
- assert "Export with key" in error, f"Expected duplicate-export error, got: {error}"
-
- time.sleep(5)
- error = node.query_and_get_error(
- f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
- settings={"allow_insert_into_iceberg": 1},
- )
- assert "Export with key" in error, f"Entry must not expire, got: {error}"
-
- # Only force_export can overwrite the existing entry.
- node.query(
- f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
- settings={"allow_insert_into_iceberg": 1, "export_merge_tree_partition_force_export": 1},
- )
- wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
-
def test_export_data_files_are_not_cleaned_up_on_commit_failure(cluster):
"""
Verify that the data files are not cleaned up on commit failure and the export is retried.
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index c4e38cbff276..8ad265a375ad 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -627,45 +627,6 @@ def test_inject_short_living_failures(cluster):
assert int(exception_count.strip()) >= 1, "Expected at least one exception"
-def test_export_entries_never_expire(cluster):
- """Export entries are kept as history; a duplicate export is rejected unless forced."""
- node = cluster.instances["replica1"]
-
- postfix = str(uuid.uuid4()).replace("-", "_")
- mt_table = f"export_ttl_mt_table_{postfix}"
- s3_table = f"export_ttl_s3_table_{postfix}"
-
- create_tables_and_insert_data(node, mt_table, s3_table, "replica1")
-
- # start export
- node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
- wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
-
- assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed"
-
- # the entry never expires, so re-exporting the same partition is always rejected
- error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
- assert "Export with key" in error, "Expected duplicate-export error"
-
- # the completed entry is still present in the history table after a while
- time.sleep(5)
- assert node.query(
- f"""
- SELECT status FROM system.replicated_partition_exports
- WHERE source_table = '{mt_table}'
- AND destination_table = '{s3_table}'
- AND partition_id = '2020'
- """
- ) == "COMPLETED\n", "Export entry should remain in history and not expire"
-
- # only force_export can overwrite the existing entry
- node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export = 1")
- wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
-
- # there should be two commit files now, one for each export
- assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Forced re-export did not succeed"
-
-
def test_export_partition_file_already_exists_policy(cluster):
node = cluster.instances["replica1"]
From 16dac3cf25c8c6616945cacb215da10dc1b03111 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 16 Jun 2026 20:57:30 -0300
Subject: [PATCH 16/33] missing piece
---
.../design/alter-table-export-part-partition.md | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/antalya/docs/design/alter-table-export-part-partition.md b/antalya/docs/design/alter-table-export-part-partition.md
index d513e4b80e41..8ebe237c634a 100644
--- a/antalya/docs/design/alter-table-export-part-partition.md
+++ b/antalya/docs/design/alter-table-export-part-partition.md
@@ -422,12 +422,13 @@ The following notes expand on expected behavior of commands.
every active part of partition `p` across all replicas that host
it; `system.replicated_partition_exports` converges to `COMPLETED`.
-4. Re-issuing the same `EXPORT PARTITION` within
- `export_merge_tree_partition_manifest_ttl` is a no-op (no
- duplicate files) unless `export_merge_tree_partition_force_export = 1`. This
- behavior avoids accidentally exporting the same data twice. Note, however
- that forcing the operation is dangerous if ClickHouse can't clean up the
- previous operation. In this case you'll potentially commit files twice.
+4. Re-issuing the same `EXPORT PARTITION` is rejected (no duplicate
+ files) unless `export_merge_tree_partition_force_export = 1`. Export
+ entries are kept indefinitely in `system.replicated_partition_exports`
+ as history; they never expire. This behavior avoids accidentally
+ exporting the same data twice. Note, however that forcing the
+ operation is dangerous if ClickHouse can't clean up the previous
+ operation. In this case you'll potentially commit files twice.
5. Killing an in-flight partition export via `KILL EXPORT PARTITION`
transitions status to `KILLED` and stops all replicas' contributions.
@@ -481,7 +482,6 @@ The following notes expand on expected behavior of commands.
| `export_merge_tree_part_filename_pattern` | query | `{part_name}_{checksum}` | `String` | both | Filename template; supports `{part_name}`, `{checksum}`, `{database}`, `{table}`, server macros. |
| `export_merge_tree_partition_force_export` | query | `false` | `Bool` | `EXPORT PARTITION` | Overwrite a live Keeper manifest for the same `(source, destination, partition_id)`. Dangerous — can produce duplicate data on the destination; use with caution. |
| `export_merge_tree_partition_max_retries` | query | `3` | `UInt64` | `EXPORT PARTITION` | Retry budget applied to both per-part export attempts and per-task commit attempts (Iceberg). The task fails terminally if commit retries alone exceed the budget. |
-| `export_merge_tree_partition_manifest_ttl` | query | `180` (seconds) | `UInt64` | `EXPORT PARTITION` | Live-manifest TTL; acts as the idempotency window. Does not interrupt in-flight tasks. Keep this greater than `export_merge_tree_partition_task_timeout_seconds` if you want the `KILLED` entry to remain visible in `system.replicated_partition_exports` after the timeout fires. |
| `export_merge_tree_partition_task_timeout_seconds` | query | `3600` (seconds) | `UInt64` (`0`=disable) | `EXPORT PARTITION` | Wall-clock cap for `PENDING` tasks; on expiry transitions to `KILLED` with a timeout reason. Measured from manifest `create_time`. Enforcement latency ≈ one manifest-updater poll cycle (~30s) plus Keeper watch propagation. |
| `export_merge_tree_partition_system_table_prefer_remote_information` | query | `false` | `Bool` | `EXPORT PARTITION` | When `true`, `system.replicated_partition_exports` fetches fresh state from Keeper (requires the `MULTI_READ` feature flag); when `false`, uses local cached state. **Default flipped from `true` to `false` in this release** — Keeper round-trips were more expensive than warranted for the typical observability workload. (See NOTE 2.)|
| `export_merge_tree_part_file_already_exists_policy` | query | `skip` | `skip` / `error` / `overwrite` | `EXPORT PARTITION` | Per-file policy during partition export. |
@@ -722,7 +722,8 @@ peak memory. Hot path is the Parquet encoder, which warrants a guard against reg
(`parts_to_do > 0`) rather than corrupt data. Acceptable but must be documented in the
upgrade notes.
- **Risk (object-storage cost / accidental large exports):** mitigated by the experimental
- gates (default off) and the manifest idempotency window.
+ gates (default off) and the duplicate-export rejection (an existing export key is refused
+ unless `export_merge_tree_partition_force_export` is set).
- **Risk (Iceberg catalog manifest retention):** if the catalog reaps old manifest files
with a retention window shorter than `export_merge_tree_partition_task_timeout_seconds`,
the rare "sole-node commits, crashes, recovers after reaper deleted the commit manifest"
From 978f9e172f5b9df86e529b017f6c5bff6802087d Mon Sep 17 00:00:00 2001
From: CarlosFelipeOR
Date: Thu, 18 Jun 2026 17:39:19 -0300
Subject: [PATCH 17/33] Skip store_data.py pre-hook in MasterCI to avoid Finish
Workflow template-size overflow
Signed-off-by: CarlosFelipeOR
---
ci/workflows/master.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ci/workflows/master.py b/ci/workflows/master.py
index cb67d223d13c..e6e9bbdeac88 100644
--- a/ci/workflows/master.py
+++ b/ci/workflows/master.py
@@ -101,7 +101,7 @@
enable_commit_status_on_failure=True,
enable_slack_feed=False,
pre_hooks=[
- "python3 ./ci/jobs/scripts/workflow_hooks/store_data.py",
+ # "python3 ./ci/jobs/scripts/workflow_hooks/store_data.py", # NOTE (carlosfelipeor): we don't use this in master CI
"python3 ./ci/jobs/scripts/workflow_hooks/version_log.py",
"python3 ./ci/jobs/scripts/workflow_hooks/parse_ci_tags.py",
# "python3 ./ci/jobs/scripts/workflow_hooks/merge_sync_pr.py", # NOTE (strtgbb): we don't do this
From 03227749150845333ad362aca0961a5bf08c4b5b Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Fri, 27 Mar 2026 13:41:38 -0400
Subject: [PATCH 18/33] Add 'PRs in Release' table to report
---
.../ci_run_report.html.jinja | 9 +
.../create_workflow_report.py | 311 +++++++++++++++---
2 files changed, 279 insertions(+), 41 deletions(-)
diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja
index 4c94465a16c6..a6e0df17c49f 100644
--- a/.github/actions/create_workflow_report/ci_run_report.html.jinja
+++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja
@@ -168,6 +168,7 @@
{% endif %}
+ Fail reason conventions:
+ KNOWN - Accepted fail and fix is not planned
+ INVESTIGATE - We don't know why it fails
+ NEEDSFIX - Investigation done and a fix is needed to make it pass
+
+ {{ checks_known_fails_html }}
+
-
Checks Known Fails
-
- Fail reason conventions:
- KNOWN - Accepted fail and fix is not planned
- INVESTIGATE - We don't know why it fails
- NEEDSFIX - Investigation done and a fix is needed to make it pass
-