From 5ec03a9700f77ba6dd465e84b4775d4a7e4323e9 Mon Sep 17 00:00:00 2001
From: "Daniel Q. Kim" <daniel.kim@altinity.com>
Date: Wed, 20 May 2026 09:24:50 +0200
Subject: [PATCH 01/33] Fix Iceberg read optimization returning NULLs for
 stats-less manifests

When an Iceberg manifest's per-file column statistics are absent or
empty (common for non-Spark writers like pyiceberg with default
settings), DataFileMetaInfo::columns_info is empty. The optimization
in StorageObjectStorageSource::createReader misread this as "all
columns are absent from the file" and returned constant NULLs for
every row while still returning the correct row count. Result: silent
data loss on icebergLocal, icebergS3, icebergAzure, icebergHDFS, and
all *Cluster variants.

Gate the optimization's absent-NULL loop directly on
columns_info.empty() instead of introducing a separate stats-presence
flag. When no usable per-column stats were parsed -- whether the
manifest omitted the stats fields entirely or declared them but left
them empty -- fall through to the Parquet reader, which correctly
handles physically-present columns (read normally) and
schema-evolved-absent columns (handled by
IcebergMetadata::getInitialSchemaByPath setting the file's own schema
as initial_header). columns_info is already serialized to workers in
the cluster JSON path, so this changes no serialization format and
keeps the fork's DataFileMetaInfo serde identical to upstream.

Closes Altinity/ClickHouse#1545.
Mirror of Altinity/ClickHouse#1688 (antalya-25.8 fix).

Signed-off-by: Daniel Q. Kim <daniel.kim@altinity.com>
---
 .../StorageObjectStorageSource.cpp            |  59 +-
 ...t_iceberg_read_optimization_empty_stats.py | 685 ++++++++++++++++++
 2 files changed, 716 insertions(+), 28 deletions(-)
 create mode 100644 tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py

diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
index ee0cf191045c..a4a92ab58599 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@@ -886,40 +886,43 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
                     }
                 }
             }
-            for (const auto & column : requested_columns_list)
+            if (file_meta_data.has_value() && !file_meta_data.value()->columns_info.empty())
             {
-                const auto & column_name = column.first;
+                for (const auto & column : requested_columns_list)
+                {
+                    const auto & column_name = column.first;
 
-                if (file_meta_data.value()->columns_info.contains(column_name))
-                    continue;
+                    if (file_meta_data.value()->columns_info.contains(column_name))
+                        continue;
 
-                if (!column.second.second.type->isNullable())
-                    continue;
+                    if (!column.second.second.type->isNullable())
+                        continue;
 
-                /// With View over Iceberg table we have someting like 'materialize(time)' as column_name
-                /// Simple cheap check
-                if (column_name.starts_with("materialize(") && column_name.ends_with(")"))
-                    continue;
+                    /// With View over Iceberg table we have someting like 'materialize(time)' as column_name
+                    /// Simple cheap check
+                    if (column_name.starts_with("materialize(") && column_name.ends_with(")"))
+                        continue;
 
-                /// Skip columns produced by prewhere or row-level filter expressions —
-                /// they are computed at read time, not stored in the file.
-                if (format_filter_info
-                    && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name)
-                        || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name)))
-                    continue;
+                    /// Skip columns produced by prewhere or row-level filter expressions —
+                    /// they are computed at read time, not stored in the file.
+                    if (format_filter_info
+                        && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name)
+                            || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name)))
+                        continue;
 
-                /// Column is nullable and absent in file
-                constant_columns_with_values[column.second.first] =
-                    ConstColumnWithValue{
-                        column.second.second,
-                        Field()
-                    };
-                constant_columns.insert(column_name);
-
-                LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'",
-                    object_info->getPath(),
-                    column_name,
-                    column.second.second.type);
+                    /// Column is nullable and absent in file
+                    constant_columns_with_values[column.second.first] =
+                        ConstColumnWithValue{
+                            column.second.second,
+                            Field()
+                        };
+                    constant_columns.insert(column_name);
+
+                    LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'",
+                        object_info->getPath(),
+                        column_name,
+                        column.second.second.type);
+                }
             }
         }
 
diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
new file mode 100644
index 000000000000..34dc2fda2918
--- /dev/null
+++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
@@ -0,0 +1,685 @@
+#!/usr/bin/env python3
+"""
+Reproducer for Altinity/ClickHouse#1545:
+icebergLocal() returns all-NULL columns when allow_experimental_iceberg_read_optimization=1
+and the Iceberg manifest has no column statistics.
+
+Root cause
+----------
+The manifest Avro writer schema intentionally omits the optional stats fields
+(value_counts, column_sizes, null_value_counts, lower_bounds, upper_bounds).
+AvroForIcebergDeserializer.hasPath() therefore returns false for all three stats
+paths in ManifestFile.cpp.  columns_infos is left empty.
+
+StorageObjectStorageSource::createReader's second loop (the schema-evolution
+"absent column as NULL" path) iterates over every requested column.  Because
+columns_infos is empty, every nullable column is absent → each is injected as a
+constant NULL.  With all requested columns constant, need_only_count is set to
+true and the Parquet file is read in count-only mode; the output is the correct
+number of rows, but every value is NULL.
+
+Tests
+-----
+- test_iceberg_local_returns_actual_rows_with_stats_less_manifest
+  FAILS on unpatched code: returns all-NULL rows instead of real data.
+
+- test_iceberg_local_returns_correct_rows_when_optimization_disabled
+  PASSES on unpatched code: regression guard — optimization=0 bypasses the
+  buggy path and reads the Parquet file normally.
+"""
+
+import json
+import os
+import tempfile
+import time
+import uuid
+
+import avro.datafile
+import avro.io
+import avro.schema
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from helpers.iceberg_utils import get_uuid_str
+from helpers.s3_tools import LocalUploader
+
+
+# Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse).
+_MANIFEST_LIST_SCHEMA_STR = json.dumps({
+    "type": "record",
+    "name": "manifest_file",
+    "fields": [
+        {"name": "manifest_path",        "type": "string"},
+        {"name": "manifest_length",      "type": "long"},
+        {"name": "partition_spec_id",    "type": "int"},
+        {"name": "content",              "type": "int"},
+        {"name": "sequence_number",      "type": "long"},
+        {"name": "min_sequence_number",  "type": "long"},
+        {"name": "added_snapshot_id",    "type": "long"},
+        {"name": "added_files_count",    "type": "int"},
+        {"name": "existing_files_count", "type": "int"},
+        {"name": "deleted_files_count",  "type": "int"},
+        {"name": "added_rows_count",     "type": "long"},
+        {"name": "existing_rows_count",  "type": "long"},
+        {"name": "deleted_rows_count",   "type": "long"},
+    ],
+})
+
+# Manifest entry Avro schema that deliberately omits all per-column stats fields:
+#   column_sizes, value_counts, null_value_counts, lower_bounds, upper_bounds.
+#
+# Absent fields → AvroForIcebergDeserializer.hasPath() returns false for each →
+# ManifestFile.cpp skips stats collection → columns_infos is empty →
+# the optimization treats every nullable column as a "schema-evolution absent column"
+# and injects a constant NULL.
+_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({
+    "type": "record",
+    "name": "manifest_entry",
+    "fields": [
+        {"name": "status",               "type": "int"},
+        {"name": "snapshot_id",          "type": ["null", "long"]},
+        {"name": "sequence_number",      "type": ["null", "long"]},
+        {"name": "file_sequence_number", "type": ["null", "long"]},
+        {
+            "name": "data_file",
+            "type": {
+                "type": "record",
+                "name": "r2",
+                "fields": [
+                    {"name": "content",            "type": "int"},
+                    {"name": "file_path",          "type": "string"},
+                    {"name": "file_format",        "type": "string"},
+                    {
+                        "name": "partition",
+                        "type": {"type": "record", "name": "r102", "fields": []},
+                    },
+                    {"name": "record_count",       "type": "long"},
+                    {"name": "file_size_in_bytes", "type": "long"},
+                ],
+            },
+        },
+    ],
+})
+
+
+def _write_avro(schema, records, path, metadata=None):
+    with open(path, "wb") as f:
+        writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), schema)
+        if metadata:
+            for k, v in metadata.items():
+                writer.set_meta(k, v if isinstance(v, bytes) else v.encode("utf-8"))
+        for rec in records:
+            writer.append(rec)
+        writer.close()
+
+
+def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
+    """
+    Build a minimal Iceberg v2 table under tmpdir/table_name.  The manifest is
+    written with a stats-less Avro schema to trigger the optimization bug.
+
+    container_base is the absolute path inside the ClickHouse container where the
+    table will be placed after upload.  File paths embedded in the Avro records and
+    metadata.json must reference container_base because they are interpreted at
+    query time inside the container.
+
+    Returns the local table directory path (tmpdir/table_name).
+    """
+    table_local = os.path.join(tmpdir, table_name)
+    data_local  = os.path.join(table_local, "data")
+    meta_local  = os.path.join(table_local, "metadata")
+    os.makedirs(data_local)
+    os.makedirs(meta_local)
+
+    # Parquet data file with two nullable columns.
+    arrow_schema = pa.schema([
+        pa.field("id",   pa.int32(),  nullable=True),
+        pa.field("data", pa.string(), nullable=True),
+    ])
+    pq.write_table(
+        pa.table(
+            {
+                "id":   pa.array([1, 2, 3], type=pa.int32()),
+                "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
+            },
+            schema=arrow_schema,
+        ),
+        os.path.join(data_local, "00000-0-data.parquet"),
+    )
+    data_size           = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
+    data_container_path = f"{container_base}/data/00000-0-data.parquet"
+
+    snapshot_id = 1
+    seq_number  = 1
+    ts_ms       = int(time.time() * 1000)
+
+    # Iceberg schema and partition-spec to embed as Avro file-level metadata.
+    # ManifestFile.cpp requires both keys to be present in the manifest's Avro header.
+    iceberg_schema = {
+        "type": "struct",
+        "schema-id": 0,
+        "fields": [
+            {"id": 1, "name": "id",   "required": False, "type": "int"},
+            {"id": 2, "name": "data", "required": False, "type": "string"},
+        ],
+    }
+
+    # Manifest file (stats-less Avro schema).
+    manifest_local_path     = os.path.join(meta_local, "00000-0-manifest.avro")
+    manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
+    _write_avro(
+        avro.schema.parse(_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR),
+        [{
+            "status":               1,             # ADDED
+            "snapshot_id":          snapshot_id,
+            "sequence_number":      seq_number,
+            "file_sequence_number": seq_number,
+            "data_file": {
+                "content":            0,            # DATA
+                "file_path":          data_container_path,
+                "file_format":        "PARQUET",
+                "partition":          {},
+                "record_count":       3,
+                "file_size_in_bytes": data_size,
+            },
+        }],
+        manifest_local_path,
+        metadata={
+            "schema":         json.dumps(iceberg_schema),
+            "partition-spec": "[]",
+        },
+    )
+    manifest_size = os.path.getsize(manifest_local_path)
+
+    # Manifest list.
+    mlist_filename      = f"snap-{snapshot_id}-0-manifest-list.avro"
+    mlist_local_path    = os.path.join(meta_local, mlist_filename)
+    mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
+    _write_avro(
+        avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
+        [{
+            "manifest_path":        manifest_container_path,
+            "manifest_length":      manifest_size,
+            "partition_spec_id":    0,
+            "content":              0,             # DATA
+            "sequence_number":      seq_number,
+            "min_sequence_number":  seq_number,
+            "added_snapshot_id":    snapshot_id,
+            "added_files_count":    1,
+            "existing_files_count": 0,
+            "deleted_files_count":  0,
+            "added_rows_count":     3,
+            "existing_rows_count":  0,
+            "deleted_rows_count":   0,
+        }],
+        mlist_local_path,
+    )
+
+    # Table metadata JSON.
+    metadata = {
+        "format-version":       2,
+        "table-uuid":           str(uuid.uuid4()),
+        "location":             container_base,
+        "last-sequence-number": seq_number,
+        "last-updated-ms":      ts_ms,
+        "last-column-id":       2,
+        "current-schema-id":    0,
+        "schemas": [{
+            "type":      "struct",
+            "schema-id": 0,
+            "fields": [
+                {"id": 1, "name": "id",   "required": False, "type": "int"},
+                {"id": 2, "name": "data", "required": False, "type": "string"},
+            ],
+        }],
+        "default-spec-id":       0,
+        "partition-specs":       [{"spec-id": 0, "fields": []}],
+        "last-partition-id":     999,
+        "default-sort-order-id": 0,
+        "sort-orders":           [{"order-id": 0, "fields": []}],
+        "properties":            {},
+        "current-snapshot-id":   snapshot_id,
+        "snapshots": [{
+            "snapshot-id":    snapshot_id,
+            "sequence-number": seq_number,
+            "timestamp-ms":   ts_ms,
+            "manifest-list":  mlist_container_path,
+            "summary":        {"operation": "append"},
+            "schema-id":      0,
+        }],
+        "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
+        "metadata-log":  [],
+        "refs":          {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
+    }
+    with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
+        json.dump(metadata, f, indent=2)
+
+    return table_local
+
+
+def _upload_to_container(instance, local_table_dir, container_base):
+    uploader = LocalUploader(instance)
+    for root, _dirs, files in os.walk(local_table_dir):
+        for fname in files:
+            local_path = os.path.join(root, fname)
+            rel        = os.path.relpath(local_path, local_table_dir)
+            uploader.upload_file(local_path, os.path.join(container_base, rel))
+
+
+# Manifest entry schema with value_counts only (partial stats).
+# column_sizes and null_value_counts are absent.  value_counts alone is enough to
+# populate columns_info so the absent-NULL guard (columns_info.empty()) is not triggered.
+_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = json.dumps({
+    "type": "record",
+    "name": "manifest_entry",
+    "fields": [
+        {"name": "status",               "type": "int"},
+        {"name": "snapshot_id",          "type": ["null", "long"]},
+        {"name": "sequence_number",      "type": ["null", "long"]},
+        {"name": "file_sequence_number", "type": ["null", "long"]},
+        {
+            "name": "data_file",
+            "type": {
+                "type": "record",
+                "name": "r2",
+                "fields": [
+                    {"name": "content",            "type": "int"},
+                    {"name": "file_path",          "type": "string"},
+                    {"name": "file_format",        "type": "string"},
+                    {
+                        "name": "partition",
+                        "type": {"type": "record", "name": "r102", "fields": []},
+                    },
+                    {"name": "record_count",       "type": "long"},
+                    {"name": "file_size_in_bytes", "type": "long"},
+                    # Only value_counts is present (column_sizes and null_value_counts absent).
+                    {
+                        "name": "value_counts",
+                        "type": {
+                            "type": "array",
+                            "items": {
+                                "type": "record",
+                                "name": "k81v81",
+                                "fields": [
+                                    {"name": "key",   "type": "int"},
+                                    {"name": "value", "type": "long"},
+                                ],
+                            },
+                        },
+                    },
+                ],
+            },
+        },
+    ],
+})
+
+# Manifest entry schema with all three stats fields (value_counts, column_sizes,
+# null_value_counts).  Represents a manifest written by a full-stats writer such as Spark.
+_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = json.dumps({
+    "type": "record",
+    "name": "manifest_entry",
+    "fields": [
+        {"name": "status",               "type": "int"},
+        {"name": "snapshot_id",          "type": ["null", "long"]},
+        {"name": "sequence_number",      "type": ["null", "long"]},
+        {"name": "file_sequence_number", "type": ["null", "long"]},
+        {
+            "name": "data_file",
+            "type": {
+                "type": "record",
+                "name": "r2",
+                "fields": [
+                    {"name": "content",            "type": "int"},
+                    {"name": "file_path",          "type": "string"},
+                    {"name": "file_format",        "type": "string"},
+                    {
+                        "name": "partition",
+                        "type": {"type": "record", "name": "r102", "fields": []},
+                    },
+                    {"name": "record_count",       "type": "long"},
+                    {"name": "file_size_in_bytes", "type": "long"},
+                    {
+                        "name": "column_sizes",
+                        "type": {
+                            "type": "array",
+                            "items": {
+                                "type": "record",
+                                "name": "k81v81",
+                                "fields": [
+                                    {"name": "key",   "type": "int"},
+                                    {"name": "value", "type": "long"},
+                                ],
+                            },
+                        },
+                    },
+                    {
+                        "name": "value_counts",
+                        "type": {
+                            "type": "array",
+                            "items": {
+                                "type": "record",
+                                "name": "k81v81_vc",
+                                "fields": [
+                                    {"name": "key",   "type": "int"},
+                                    {"name": "value", "type": "long"},
+                                ],
+                            },
+                        },
+                    },
+                    {
+                        "name": "null_value_counts",
+                        "type": {
+                            "type": "array",
+                            "items": {
+                                "type": "record",
+                                "name": "k81v81_nc",
+                                "fields": [
+                                    {"name": "key",   "type": "int"},
+                                    {"name": "value", "type": "long"},
+                                ],
+                            },
+                        },
+                    },
+                ],
+            },
+        },
+    ],
+})
+
+
+def _create_iceberg_table_with_schema(tmpdir, table_name, container_base,
+                                      manifest_schema_str, manifest_extra_fields=None):
+    """
+    Build a minimal Iceberg v2 table.  manifest_extra_fields is added to each
+    manifest entry's data_file record when using schemas that include stats.
+    """
+    table_local = os.path.join(tmpdir, table_name)
+    data_local  = os.path.join(table_local, "data")
+    meta_local  = os.path.join(table_local, "metadata")
+    os.makedirs(data_local)
+    os.makedirs(meta_local)
+
+    arrow_schema = pa.schema([
+        pa.field("id",   pa.int32(),  nullable=True),
+        pa.field("data", pa.string(), nullable=True),
+    ])
+    pq.write_table(
+        pa.table(
+            {
+                "id":   pa.array([1, 2, 3], type=pa.int32()),
+                "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
+            },
+            schema=arrow_schema,
+        ),
+        os.path.join(data_local, "00000-0-data.parquet"),
+    )
+    data_size           = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
+    data_container_path = f"{container_base}/data/00000-0-data.parquet"
+
+    snapshot_id = 1
+    seq_number  = 1
+    ts_ms       = int(time.time() * 1000)
+
+    iceberg_schema = {
+        "type": "struct",
+        "schema-id": 0,
+        "fields": [
+            {"id": 1, "name": "id",   "required": False, "type": "int"},
+            {"id": 2, "name": "data", "required": False, "type": "string"},
+        ],
+    }
+
+    data_file_record = {
+        "content":            0,
+        "file_path":          data_container_path,
+        "file_format":        "PARQUET",
+        "partition":          {},
+        "record_count":       3,
+        "file_size_in_bytes": data_size,
+    }
+    if manifest_extra_fields:
+        data_file_record.update(manifest_extra_fields)
+
+    manifest_local_path     = os.path.join(meta_local, "00000-0-manifest.avro")
+    manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
+    _write_avro(
+        avro.schema.parse(manifest_schema_str),
+        [{
+            "status":               1,
+            "snapshot_id":          snapshot_id,
+            "sequence_number":      seq_number,
+            "file_sequence_number": seq_number,
+            "data_file":            data_file_record,
+        }],
+        manifest_local_path,
+        metadata={
+            "schema":         json.dumps(iceberg_schema),
+            "partition-spec": "[]",
+        },
+    )
+    manifest_size = os.path.getsize(manifest_local_path)
+
+    mlist_filename       = f"snap-{snapshot_id}-0-manifest-list.avro"
+    mlist_local_path     = os.path.join(meta_local, mlist_filename)
+    mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
+    _write_avro(
+        avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
+        [{
+            "manifest_path":        manifest_container_path,
+            "manifest_length":      manifest_size,
+            "partition_spec_id":    0,
+            "content":              0,
+            "sequence_number":      seq_number,
+            "min_sequence_number":  seq_number,
+            "added_snapshot_id":    snapshot_id,
+            "added_files_count":    1,
+            "existing_files_count": 0,
+            "deleted_files_count":  0,
+            "added_rows_count":     3,
+            "existing_rows_count":  0,
+            "deleted_rows_count":   0,
+        }],
+        mlist_local_path,
+    )
+
+    metadata = {
+        "format-version":       2,
+        "table-uuid":           str(uuid.uuid4()),
+        "location":             container_base,
+        "last-sequence-number": seq_number,
+        "last-updated-ms":      ts_ms,
+        "last-column-id":       2,
+        "current-schema-id":    0,
+        "schemas": [{
+            "type":      "struct",
+            "schema-id": 0,
+            "fields": [
+                {"id": 1, "name": "id",   "required": False, "type": "int"},
+                {"id": 2, "name": "data", "required": False, "type": "string"},
+            ],
+        }],
+        "default-spec-id":       0,
+        "partition-specs":       [{"spec-id": 0, "fields": []}],
+        "last-partition-id":     999,
+        "default-sort-order-id": 0,
+        "sort-orders":           [{"order-id": 0, "fields": []}],
+        "properties":            {},
+        "current-snapshot-id":   snapshot_id,
+        "snapshots": [{
+            "snapshot-id":    snapshot_id,
+            "sequence-number": seq_number,
+            "timestamp-ms":   ts_ms,
+            "manifest-list":  mlist_container_path,
+            "summary":        {"operation": "append"},
+            "schema-id":      0,
+        }],
+        "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
+        "metadata-log":  [],
+        "refs":          {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
+    }
+    with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
+        json.dump(metadata, f, indent=2)
+
+    return table_local
+
+
+def test_iceberg_local_returns_actual_rows_with_stats_less_manifest(
+    started_cluster_iceberg_no_spark,
+):
+    """
+    FAILS on unpatched code (Altinity#1545).
+
+    With allow_experimental_iceberg_read_optimization=1 (the default), reading an
+    icebergLocal table whose manifest omits column statistics returns all-NULL rows
+    instead of real data.  The correct output is 3 rows: (1,'hello'), (2,'world'),
+    (3,'iceberg').
+    """
+    instance   = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name = "test_opt_on_stats_less_" + get_uuid_str()
+    container_base = (
+        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
+        _upload_to_container(instance, local_dir, container_base)
+
+    result = instance.query(
+        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+        " ORDER BY id"
+        " SETTINGS allow_experimental_iceberg_read_optimization=1"
+    ).strip()
+
+    assert result == "1\thello\n2\tworld\n3\ticeberg", (
+        f"Got: {result!r}\n"
+        "All-NULL rows means the bug fired: empty columns_infos caused the "
+        "optimization to treat every nullable column as an absent schema-evolution "
+        "column and inject constant NULL (need_only_count=1)."
+    )
+
+
+def test_iceberg_local_returns_correct_rows_when_optimization_disabled(
+    started_cluster_iceberg_no_spark,
+):
+    """
+    PASSES on unpatched code.
+
+    With allow_experimental_iceberg_read_optimization=0 the optimization block is
+    skipped entirely and the Parquet file is read normally.  This test is a
+    regression guard: it must keep passing after the bug is fixed.
+    """
+    instance   = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name = "test_opt_off_stats_less_" + get_uuid_str()
+    container_base = (
+        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
+        _upload_to_container(instance, local_dir, container_base)
+
+    result = instance.query(
+        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+        " ORDER BY id"
+        " SETTINGS allow_experimental_iceberg_read_optimization=0"
+    ).strip()
+
+    assert result == "1\thello\n2\tworld\n3\ticeberg"
+
+
+def test_iceberg_local_partial_stats_manifest_reads_correctly(
+    started_cluster_iceberg_no_spark,
+):
+    """
+    Regression test for Altinity#1545: partial stats (value_counts only).
+
+    A manifest that includes value_counts but omits column_sizes and
+    null_value_counts still populates columns_info.  The fix must treat the
+    columns as real (not absent), so real data is returned.
+    """
+    instance   = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name = "test_opt_on_partial_stats_" + get_uuid_str()
+    container_base = (
+        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+    )
+
+    # value_counts for field_id 1 (id) and 2 (data): 3 rows each.
+    extra = {
+        "value_counts": [
+            {"key": 1, "value": 3},
+            {"key": 2, "value": 3},
+        ],
+    }
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local_dir = _create_iceberg_table_with_schema(
+            tmpdir, table_name, container_base,
+            _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
+            manifest_extra_fields=extra,
+        )
+        _upload_to_container(instance, local_dir, container_base)
+
+    result = instance.query(
+        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+        " ORDER BY id"
+        " SETTINGS allow_experimental_iceberg_read_optimization=1"
+    ).strip()
+
+    assert result == "1\thello\n2\tworld\n3\ticeberg", (
+        f"Got: {result!r}\n"
+        "Partial-stats manifest (value_counts only) should not trigger the absent-NULL "
+        "path because columns_info is non-empty.  All-NULL rows means the guard "
+        "fired incorrectly for partial-stats manifests."
+    )
+
+
+def test_iceberg_local_full_stats_manifest_reads_correctly(
+    started_cluster_iceberg_no_spark,
+):
+    """
+    Control test: full-stats manifest (Spark-like) must continue to return real data.
+
+    A manifest with all three stats fields (column_sizes, value_counts,
+    null_value_counts) is the common case written by Spark.  This test verifies the
+    optimization does not regress for the normal path after the fix.
+    """
+    instance   = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name = "test_opt_on_full_stats_" + get_uuid_str()
+    container_base = (
+        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
+    )
+
+    # Full stats for field_id 1 (id) and 2 (data).
+    extra = {
+        "column_sizes": [
+            {"key": 1, "value": 64},
+            {"key": 2, "value": 128},
+        ],
+        "value_counts": [
+            {"key": 1, "value": 3},
+            {"key": 2, "value": 3},
+        ],
+        "null_value_counts": [
+            {"key": 1, "value": 0},
+            {"key": 2, "value": 0},
+        ],
+    }
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local_dir = _create_iceberg_table_with_schema(
+            tmpdir, table_name, container_base,
+            _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
+            manifest_extra_fields=extra,
+        )
+        _upload_to_container(instance, local_dir, container_base)
+
+    result = instance.query(
+        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+        " ORDER BY id"
+        " SETTINGS allow_experimental_iceberg_read_optimization=1"
+    ).strip()
+
+    assert result == "1\thello\n2\tworld\n3\ticeberg", (
+        f"Got: {result!r}\n"
+        "Full-stats manifest should return real data when optimization is enabled."
+    )

From 056eb1a3ec8926a0a1a9541cf767571b4e9219b5 Mon Sep 17 00:00:00 2001
From: "Daniel Q. Kim" <daniel.kim@altinity.com>
Date: Mon, 22 Jun 2026 14:55:28 +0200
Subject: [PATCH 02/33] Rework Iceberg empty-stats tests per review feedback

Address @zvonand's review of #1895: the tests were poorly scoped. The C++
fix (gate the optimization's absent-NULL loop on `!columns_info.empty()`) is
unchanged; only the integration tests are reworked.

- Drop the `allow_experimental_iceberg_read_optimization=0` test. The fix
  only changes behavior when the optimization is enabled, so the disabled
  path is unrelated to this change and belongs in a separate PR if at all.

- Add `test_stats_less_manifest_schema_evolution_absent_column`, a
  discriminating test where the Iceberg schema declares `id`+`data` but the
  Parquet file was written under the older schema and physically contains
  only `id`. On the fixed build the stats-less manifest falls through to the
  Parquet reader, reads `id` for real and synthesizes the absent `data`
  column as NULL via `IcebergMetadata::getInitialSchemaByPath` (the file's
  own schema becomes `initial_header`). On the unpatched build `id` is NULL
  too, so the result discriminates.

- Add `test_stats_less_manifest_cluster_returns_real_data`, exercising the
  cluster JSON serialization path with `icebergLocalCluster`. An empty
  `columns_info` must round-trip to the workers and still read real data,
  confirming the bug also affected the `*Cluster` variants.

- Add `test_non_empty_stats_absent_column_still_null`, a non-regression
  guard that the optimization's legitimate absent-NULL path (a column absent
  from a non-empty `columns_info`) keeps returning NULL while present columns
  read for real.

- Merge the partial-stats and full-stats cases into one parametrized
  `test_non_empty_stats_returns_real_data`.

- Keep the Iceberg fixtures generated in code rather than checked in as
  static files (zvonand's readability comment). Each manifest entry and the
  `metadata.json` embed per-test runtime values -- the UUID-suffixed
  container path, the byte sizes of the freshly written Parquet/Avro files
  and a random `table-uuid` -- so they are templates filled at run time, not
  static blobs. The one static part, the Avro schemas, is deduplicated into
  module constants, and a module-level comment explains the choice.

Verified on the fixed build: all 6 cases pass. With the guard reverted to
run the loop unconditionally, the three stats-less tests fail with all-NULL
rows while the two non-empty-stats guards pass.

Signed-off-by: Daniel Q. Kim <daniel.kim@altinity.com>
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ...t_iceberg_read_optimization_empty_stats.py | 895 ++++++++----------
 1 file changed, 380 insertions(+), 515 deletions(-)

diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
index 34dc2fda2918..fe0c8e6d085f 100644
--- a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
+++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
@@ -1,31 +1,48 @@
 #!/usr/bin/env python3
 """
-Reproducer for Altinity/ClickHouse#1545:
-icebergLocal() returns all-NULL columns when allow_experimental_iceberg_read_optimization=1
-and the Iceberg manifest has no column statistics.
+Regression coverage for Altinity/ClickHouse#1545:
+the Iceberg read optimization (``allow_experimental_iceberg_read_optimization=1``)
+returned all-NULL columns when the manifest carried no per-file column statistics.
 
 Root cause
 ----------
-The manifest Avro writer schema intentionally omits the optional stats fields
-(value_counts, column_sizes, null_value_counts, lower_bounds, upper_bounds).
-AvroForIcebergDeserializer.hasPath() therefore returns false for all three stats
-paths in ManifestFile.cpp.  columns_infos is left empty.
-
-StorageObjectStorageSource::createReader's second loop (the schema-evolution
-"absent column as NULL" path) iterates over every requested column.  Because
-columns_infos is empty, every nullable column is absent → each is injected as a
-constant NULL.  With all requested columns constant, need_only_count is set to
-true and the Parquet file is read in count-only mode; the output is the correct
-number of rows, but every value is NULL.
-
-Tests
------
-- test_iceberg_local_returns_actual_rows_with_stats_less_manifest
-  FAILS on unpatched code: returns all-NULL rows instead of real data.
-
-- test_iceberg_local_returns_correct_rows_when_optimization_disabled
-  PASSES on unpatched code: regression guard — optimization=0 bypasses the
-  buggy path and reads the Parquet file normally.
+When a manifest's optional stats fields (``value_counts``, ``column_sizes``,
+``null_value_counts``, ``lower_bounds``, ``upper_bounds``) are all absent --
+common for non-Spark writers such as pyiceberg with default settings --
+``DataFileMetaInfo::columns_info`` is left empty.  The optimization's
+absent-column loop in ``StorageObjectStorageSource::createReader`` then iterated
+every requested column, found none in the empty ``columns_info`` map, and
+injected each nullable column as a constant ``NULL``.  With every requested
+column constant, ``need_only_count`` became true and the Parquet file was read
+in count-only mode: correct row count, every value ``NULL``.
+
+The fix gates that loop on ``!columns_info.empty()``; an empty map now falls
+through to the Parquet reader, which reads physically-present columns normally
+and synthesizes schema-evolved-absent columns as ``NULL`` via
+``IcebergMetadata::getInitialSchemaByPath``.
+
+Test taxonomy
+-------------
+Discriminating (fail on the unpatched build, pass on the fixed build):
+  - test_stats_less_manifest_returns_real_data
+  - test_stats_less_manifest_schema_evolution_absent_column
+Cluster/serialization path (the cluster JSON round-trip of an empty
+``columns_info``):
+  - test_stats_less_manifest_cluster_returns_real_data
+Non-regression guards (pass on both builds; the optimization's legitimate
+absent-NULL and present-column behavior must keep working when stats exist):
+  - test_non_empty_stats_absent_column_still_null
+  - test_non_empty_stats_returns_real_data
+
+Why the Iceberg fixtures are generated in code rather than checked in as static
+files: every manifest entry and the table ``metadata.json`` embed per-test
+runtime values -- the UUID-suffixed container path, the byte sizes of the
+freshly written Parquet and Avro files, and a random ``table-uuid`` -- so they
+are templates filled at run time, not the static blobs that a checked-in fixture
+(e.g. ``tests/queries/0_stateless/data_minio/.../v1.metadata.json``) would be.
+The one genuinely static part, the Avro schemas, is deduplicated into the
+module-level constants below; the stats-less manifest entry -- the artifact
+under test -- is necessarily produced in code.
 """
 
 import json
@@ -39,11 +56,20 @@
 import avro.schema
 import pyarrow as pa
 import pyarrow.parquet as pq
+import pytest
 
 from helpers.iceberg_utils import get_uuid_str
 from helpers.s3_tools import LocalUploader
 
 
+# The three rows every fixture stores; ``data`` is omitted from the Parquet file
+# when a test needs a column the file physically lacks.
+_IDS = [1, 2, 3]
+_DATA = ["hello", "world", "iceberg"]
+_REAL_DATA_RESULT = "1\thello\n2\tworld\n3\ticeberg"
+_DATA_NULL_RESULT = "1\t\\N\n2\t\\N\n3\t\\N"
+
+
 # Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse).
 _MANIFEST_LIST_SCHEMA_STR = json.dumps({
     "type": "record",
@@ -65,46 +91,84 @@
     ],
 })
 
-# Manifest entry Avro schema that deliberately omits all per-column stats fields:
-#   column_sizes, value_counts, null_value_counts, lower_bounds, upper_bounds.
-#
-# Absent fields → AvroForIcebergDeserializer.hasPath() returns false for each →
-# ManifestFile.cpp skips stats collection → columns_infos is empty →
-# the optimization treats every nullable column as a "schema-evolution absent column"
-# and injects a constant NULL.
-_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({
-    "type": "record",
-    "name": "manifest_entry",
-    "fields": [
-        {"name": "status",               "type": "int"},
-        {"name": "snapshot_id",          "type": ["null", "long"]},
-        {"name": "sequence_number",      "type": ["null", "long"]},
-        {"name": "file_sequence_number", "type": ["null", "long"]},
-        {
-            "name": "data_file",
+
+def _data_file_fields(stats_fields):
+    """data_file record fields, optionally followed by the given stats arrays."""
+    fields = [
+        {"name": "content",            "type": "int"},
+        {"name": "file_path",          "type": "string"},
+        {"name": "file_format",        "type": "string"},
+        {"name": "partition", "type": {"type": "record", "name": "r102", "fields": []}},
+        {"name": "record_count",       "type": "long"},
+        {"name": "file_size_in_bytes", "type": "long"},
+    ]
+    for i, name in enumerate(stats_fields):
+        fields.append({
+            "name": name,
             "type": {
-                "type": "record",
-                "name": "r2",
-                "fields": [
-                    {"name": "content",            "type": "int"},
-                    {"name": "file_path",          "type": "string"},
-                    {"name": "file_format",        "type": "string"},
-                    {
-                        "name": "partition",
-                        "type": {"type": "record", "name": "r102", "fields": []},
-                    },
-                    {"name": "record_count",       "type": "long"},
-                    {"name": "file_size_in_bytes", "type": "long"},
-                ],
+                "type": "array",
+                "items": {
+                    "type": "record",
+                    "name": f"k81v81_{i}",
+                    "fields": [
+                        {"name": "key",   "type": "int"},
+                        {"name": "value", "type": "long"},
+                    ],
+                },
             },
-        },
-    ],
-})
+        })
+    return fields
 
 
-def _write_avro(schema, records, path, metadata=None):
+def _manifest_entry_schema(stats_fields):
+    """Iceberg v2 manifest entry Avro schema carrying the given stats arrays."""
+    return json.dumps({
+        "type": "record",
+        "name": "manifest_entry",
+        "fields": [
+            {"name": "status",               "type": "int"},
+            {"name": "snapshot_id",          "type": ["null", "long"]},
+            {"name": "sequence_number",      "type": ["null", "long"]},
+            {"name": "file_sequence_number", "type": ["null", "long"]},
+            {
+                "name": "data_file",
+                "type": {
+                    "type": "record",
+                    "name": "r2",
+                    "fields": _data_file_fields(stats_fields),
+                },
+            },
+        ],
+    })
+
+
+# Stats-less manifest: omits every optional stats field -> columns_info is empty.
+_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = _manifest_entry_schema([])
+# Partial stats: value_counts only (column_sizes and null_value_counts absent);
+# enough to populate columns_info so the empty-stats guard is not triggered.
+_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = _manifest_entry_schema(["value_counts"])
+# Full stats: the common Spark-written shape.
+_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = _manifest_entry_schema(
+    ["column_sizes", "value_counts", "null_value_counts"]
+)
+
+
+def _schema_fields(with_data):
+    fields = [{"id": 1, "name": "id", "required": False, "type": "int"}]
+    if with_data:
+        fields.append({"id": 2, "name": "data", "required": False, "type": "string"})
+    return fields
+
+
+def _iceberg_schema(schema_id, with_data):
+    return {"type": "struct", "schema-id": schema_id, "fields": _schema_fields(with_data)}
+
+
+def _write_avro(schema_str, records, path, metadata=None):
     with open(path, "wb") as f:
-        writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), schema)
+        writer = avro.datafile.DataFileWriter(
+            f, avro.io.DatumWriter(), avro.schema.parse(schema_str)
+        )
         if metadata:
             for k, v in metadata.items():
                 writer.set_meta(k, v if isinstance(v, bytes) else v.encode("utf-8"))
@@ -113,17 +177,32 @@ def _write_avro(schema, records, path, metadata=None):
         writer.close()
 
 
-def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
+def _create_iceberg_table(
+    tmpdir,
+    table_name,
+    container_base,
+    *,
+    manifest_entry_schema_str,
+    file_has_data_column=True,
+    data_file_stats=None,
+    schema_evolution=False,
+):
     """
-    Build a minimal Iceberg v2 table under tmpdir/table_name.  The manifest is
-    written with a stats-less Avro schema to trigger the optimization bug.
+    Build a minimal Iceberg v2 table under ``tmpdir/table_name``.
 
     container_base is the absolute path inside the ClickHouse container where the
-    table will be placed after upload.  File paths embedded in the Avro records and
-    metadata.json must reference container_base because they are interpreted at
-    query time inside the container.
-
-    Returns the local table directory path (tmpdir/table_name).
+    table is placed after upload; file paths embedded in the Avro records and
+    metadata.json must reference it because they are interpreted at query time
+    inside the container.
+
+    file_has_data_column controls whether the Parquet file physically contains
+    the ``data`` column.  data_file_stats, when given, supplies the manifest
+    entry's stats arrays (e.g. ``value_counts``).  schema_evolution lays the
+    table out with two schemas (0: ``id`` only, 1: ``id``+``data``) and two
+    snapshots so the data file resolves to the older schema id, exercising the
+    Parquet-reader fall-through that synthesizes the absent ``data`` column.
+
+    Returns the local table directory path.
     """
     table_local = os.path.join(tmpdir, table_name)
     data_local  = os.path.join(table_local, "data")
@@ -131,125 +210,137 @@ def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
     os.makedirs(data_local)
     os.makedirs(meta_local)
 
-    # Parquet data file with two nullable columns.
-    arrow_schema = pa.schema([
-        pa.field("id",   pa.int32(),  nullable=True),
-        pa.field("data", pa.string(), nullable=True),
-    ])
+    arrow_fields = [pa.field("id", pa.int32(), nullable=True)]
+    table_data = {"id": pa.array(_IDS, type=pa.int32())}
+    if file_has_data_column:
+        arrow_fields.append(pa.field("data", pa.string(), nullable=True))
+        table_data["data"] = pa.array(_DATA, type=pa.string())
     pq.write_table(
-        pa.table(
-            {
-                "id":   pa.array([1, 2, 3], type=pa.int32()),
-                "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
-            },
-            schema=arrow_schema,
-        ),
+        pa.table(table_data, schema=pa.schema(arrow_fields)),
         os.path.join(data_local, "00000-0-data.parquet"),
     )
     data_size           = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
     data_container_path = f"{container_base}/data/00000-0-data.parquet"
 
-    snapshot_id = 1
-    seq_number  = 1
-    ts_ms       = int(time.time() * 1000)
+    ts_ms = int(time.time() * 1000)
 
-    # Iceberg schema and partition-spec to embed as Avro file-level metadata.
-    # ManifestFile.cpp requires both keys to be present in the manifest's Avro header.
-    iceberg_schema = {
-        "type": "struct",
-        "schema-id": 0,
-        "fields": [
-            {"id": 1, "name": "id",   "required": False, "type": "int"},
-            {"id": 2, "name": "data", "required": False, "type": "string"},
-        ],
+    # The manifest entry's snapshot_id is the snapshot that wrote the data file.
+    # Under schema evolution that snapshot (id 1) carries the older schema (0),
+    # so the iterator resolves the file to schema 0 while reading the table at
+    # schema 1 -- the condition that makes getInitialSchemaByPath return the
+    # file's own schema as initial_header.
+    entry_snapshot_id = 1
+    write_schema_id   = 0  # schema the data file was written with
+
+    data_file_record = {
+        "content":            0,            # DATA
+        "file_path":          data_container_path,
+        "file_format":        "PARQUET",
+        "partition":          {},
+        "record_count":       len(_IDS),
+        "file_size_in_bytes": data_size,
     }
+    if data_file_stats:
+        data_file_record.update(data_file_stats)
 
-    # Manifest file (stats-less Avro schema).
     manifest_local_path     = os.path.join(meta_local, "00000-0-manifest.avro")
     manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
     _write_avro(
-        avro.schema.parse(_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR),
+        manifest_entry_schema_str,
         [{
             "status":               1,             # ADDED
-            "snapshot_id":          snapshot_id,
-            "sequence_number":      seq_number,
-            "file_sequence_number": seq_number,
-            "data_file": {
-                "content":            0,            # DATA
-                "file_path":          data_container_path,
-                "file_format":        "PARQUET",
-                "partition":          {},
-                "record_count":       3,
-                "file_size_in_bytes": data_size,
-            },
+            "snapshot_id":          entry_snapshot_id,
+            "sequence_number":      1,
+            "file_sequence_number": 1,
+            "data_file":            data_file_record,
         }],
         manifest_local_path,
         metadata={
-            "schema":         json.dumps(iceberg_schema),
+            # The manifest's own schema is the data file's write schema.
+            "schema":         json.dumps(_iceberg_schema(write_schema_id, file_has_data_column)),
             "partition-spec": "[]",
         },
     )
     manifest_size = os.path.getsize(manifest_local_path)
 
-    # Manifest list.
-    mlist_filename      = f"snap-{snapshot_id}-0-manifest-list.avro"
-    mlist_local_path    = os.path.join(meta_local, mlist_filename)
-    mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
-    _write_avro(
-        avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
-        [{
-            "manifest_path":        manifest_container_path,
-            "manifest_length":      manifest_size,
-            "partition_spec_id":    0,
-            "content":              0,             # DATA
-            "sequence_number":      seq_number,
-            "min_sequence_number":  seq_number,
-            "added_snapshot_id":    snapshot_id,
-            "added_files_count":    1,
-            "existing_files_count": 0,
-            "deleted_files_count":  0,
-            "added_rows_count":     3,
-            "existing_rows_count":  0,
-            "deleted_rows_count":   0,
-        }],
-        mlist_local_path,
-    )
+    def _write_manifest_list(snapshot_id):
+        filename       = f"snap-{snapshot_id}-0-manifest-list.avro"
+        local_path     = os.path.join(meta_local, filename)
+        container_path = f"{container_base}/metadata/{filename}"
+        _write_avro(
+            _MANIFEST_LIST_SCHEMA_STR,
+            [{
+                "manifest_path":        manifest_container_path,
+                "manifest_length":      manifest_size,
+                "partition_spec_id":    0,
+                "content":              0,             # DATA
+                "sequence_number":      1,
+                "min_sequence_number":  1,
+                "added_snapshot_id":    entry_snapshot_id,
+                "added_files_count":    1,
+                "existing_files_count": 0,
+                "deleted_files_count":  0,
+                "added_rows_count":     len(_IDS),
+                "existing_rows_count":  0,
+                "deleted_rows_count":   0,
+            }],
+            local_path,
+        )
+        return container_path
+
+    if schema_evolution:
+        # Two schemas, two snapshots: the data file was written under schema 0
+        # (id only); the table later evolved to schema 1 (id + data).
+        schemas          = [_iceberg_schema(0, with_data=False), _iceberg_schema(1, with_data=True)]
+        current_schema   = 1
+        last_column_id   = 2
+        snapshots = [
+            {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)},
+            {"snapshot-id": 2, "schema-id": 1, "manifest-list": _write_manifest_list(2)},
+        ]
+        current_snapshot = 2
+        last_seq         = 2
+    else:
+        # Single schema (id + data); the data file resolves to it directly.
+        schemas          = [_iceberg_schema(0, with_data=True)]
+        current_schema   = 0
+        last_column_id   = 2
+        snapshots = [
+            {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)},
+        ]
+        current_snapshot = 1
+        last_seq         = 1
 
-    # Table metadata JSON.
     metadata = {
         "format-version":       2,
         "table-uuid":           str(uuid.uuid4()),
         "location":             container_base,
-        "last-sequence-number": seq_number,
+        "last-sequence-number": last_seq,
         "last-updated-ms":      ts_ms,
-        "last-column-id":       2,
-        "current-schema-id":    0,
-        "schemas": [{
-            "type":      "struct",
-            "schema-id": 0,
-            "fields": [
-                {"id": 1, "name": "id",   "required": False, "type": "int"},
-                {"id": 2, "name": "data", "required": False, "type": "string"},
-            ],
-        }],
+        "last-column-id":       last_column_id,
+        "current-schema-id":    current_schema,
+        "schemas":              schemas,
         "default-spec-id":       0,
         "partition-specs":       [{"spec-id": 0, "fields": []}],
         "last-partition-id":     999,
         "default-sort-order-id": 0,
         "sort-orders":           [{"order-id": 0, "fields": []}],
         "properties":            {},
-        "current-snapshot-id":   snapshot_id,
-        "snapshots": [{
-            "snapshot-id":    snapshot_id,
-            "sequence-number": seq_number,
-            "timestamp-ms":   ts_ms,
-            "manifest-list":  mlist_container_path,
-            "summary":        {"operation": "append"},
-            "schema-id":      0,
-        }],
-        "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
+        "current-snapshot-id":   current_snapshot,
+        "snapshots": [
+            {
+                "snapshot-id":     s["snapshot-id"],
+                "sequence-number": s["snapshot-id"],
+                "timestamp-ms":    ts_ms,
+                "manifest-list":   s["manifest-list"],
+                "summary":         {"operation": "append"},
+                "schema-id":       s["schema-id"],
+            }
+            for s in snapshots
+        ],
+        "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": s["snapshot-id"]} for s in snapshots],
         "metadata-log":  [],
-        "refs":          {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
+        "refs":          {"main": {"snapshot-id": current_snapshot, "type": "branch"}},
     }
     with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
         json.dump(metadata, f, indent=2)
@@ -257,7 +348,7 @@ def _create_stats_less_iceberg_table(tmpdir, table_name, container_base):
     return table_local
 
 
-def _upload_to_container(instance, local_table_dir, container_base):
+def _upload_to_node(instance, local_table_dir, container_base):
     uploader = LocalUploader(instance)
     for root, _dirs, files in os.walk(local_table_dir):
         for fname in files:
@@ -266,420 +357,194 @@ def _upload_to_container(instance, local_table_dir, container_base):
             uploader.upload_file(local_path, os.path.join(container_base, rel))
 
 
-# Manifest entry schema with value_counts only (partial stats).
-# column_sizes and null_value_counts are absent.  value_counts alone is enough to
-# populate columns_info so the absent-NULL guard (columns_info.empty()) is not triggered.
-_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = json.dumps({
-    "type": "record",
-    "name": "manifest_entry",
-    "fields": [
-        {"name": "status",               "type": "int"},
-        {"name": "snapshot_id",          "type": ["null", "long"]},
-        {"name": "sequence_number",      "type": ["null", "long"]},
-        {"name": "file_sequence_number", "type": ["null", "long"]},
-        {
-            "name": "data_file",
-            "type": {
-                "type": "record",
-                "name": "r2",
-                "fields": [
-                    {"name": "content",            "type": "int"},
-                    {"name": "file_path",          "type": "string"},
-                    {"name": "file_format",        "type": "string"},
-                    {
-                        "name": "partition",
-                        "type": {"type": "record", "name": "r102", "fields": []},
-                    },
-                    {"name": "record_count",       "type": "long"},
-                    {"name": "file_size_in_bytes", "type": "long"},
-                    # Only value_counts is present (column_sizes and null_value_counts absent).
-                    {
-                        "name": "value_counts",
-                        "type": {
-                            "type": "array",
-                            "items": {
-                                "type": "record",
-                                "name": "k81v81",
-                                "fields": [
-                                    {"name": "key",   "type": "int"},
-                                    {"name": "value", "type": "long"},
-                                ],
-                            },
-                        },
-                    },
-                ],
-            },
-        },
-    ],
-})
-
-# Manifest entry schema with all three stats fields (value_counts, column_sizes,
-# null_value_counts).  Represents a manifest written by a full-stats writer such as Spark.
-_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = json.dumps({
-    "type": "record",
-    "name": "manifest_entry",
-    "fields": [
-        {"name": "status",               "type": "int"},
-        {"name": "snapshot_id",          "type": ["null", "long"]},
-        {"name": "sequence_number",      "type": ["null", "long"]},
-        {"name": "file_sequence_number", "type": ["null", "long"]},
-        {
-            "name": "data_file",
-            "type": {
-                "type": "record",
-                "name": "r2",
-                "fields": [
-                    {"name": "content",            "type": "int"},
-                    {"name": "file_path",          "type": "string"},
-                    {"name": "file_format",        "type": "string"},
-                    {
-                        "name": "partition",
-                        "type": {"type": "record", "name": "r102", "fields": []},
-                    },
-                    {"name": "record_count",       "type": "long"},
-                    {"name": "file_size_in_bytes", "type": "long"},
-                    {
-                        "name": "column_sizes",
-                        "type": {
-                            "type": "array",
-                            "items": {
-                                "type": "record",
-                                "name": "k81v81",
-                                "fields": [
-                                    {"name": "key",   "type": "int"},
-                                    {"name": "value", "type": "long"},
-                                ],
-                            },
-                        },
-                    },
-                    {
-                        "name": "value_counts",
-                        "type": {
-                            "type": "array",
-                            "items": {
-                                "type": "record",
-                                "name": "k81v81_vc",
-                                "fields": [
-                                    {"name": "key",   "type": "int"},
-                                    {"name": "value", "type": "long"},
-                                ],
-                            },
-                        },
-                    },
-                    {
-                        "name": "null_value_counts",
-                        "type": {
-                            "type": "array",
-                            "items": {
-                                "type": "record",
-                                "name": "k81v81_nc",
-                                "fields": [
-                                    {"name": "key",   "type": "int"},
-                                    {"name": "value", "type": "long"},
-                                ],
-                            },
-                        },
-                    },
-                ],
-            },
-        },
-    ],
-})
+def _container_base(table_name):
+    return f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
 
 
-def _create_iceberg_table_with_schema(tmpdir, table_name, container_base,
-                                      manifest_schema_str, manifest_extra_fields=None):
-    """
-    Build a minimal Iceberg v2 table.  manifest_extra_fields is added to each
-    manifest entry's data_file record when using schemas that include stats.
-    """
-    table_local = os.path.join(tmpdir, table_name)
-    data_local  = os.path.join(table_local, "data")
-    meta_local  = os.path.join(table_local, "metadata")
-    os.makedirs(data_local)
-    os.makedirs(meta_local)
-
-    arrow_schema = pa.schema([
-        pa.field("id",   pa.int32(),  nullable=True),
-        pa.field("data", pa.string(), nullable=True),
-    ])
-    pq.write_table(
-        pa.table(
-            {
-                "id":   pa.array([1, 2, 3], type=pa.int32()),
-                "data": pa.array(["hello", "world", "iceberg"], type=pa.string()),
-            },
-            schema=arrow_schema,
-        ),
-        os.path.join(data_local, "00000-0-data.parquet"),
-    )
-    data_size           = os.path.getsize(os.path.join(data_local, "00000-0-data.parquet"))
-    data_container_path = f"{container_base}/data/00000-0-data.parquet"
+def _query_local(instance, container_base, optimization=1):
+    return instance.query(
+        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+        " ORDER BY id"
+        f" SETTINGS allow_experimental_iceberg_read_optimization={optimization}"
+    ).strip()
 
-    snapshot_id = 1
-    seq_number  = 1
-    ts_ms       = int(time.time() * 1000)
 
-    iceberg_schema = {
-        "type": "struct",
-        "schema-id": 0,
-        "fields": [
-            {"id": 1, "name": "id",   "required": False, "type": "int"},
-            {"id": 2, "name": "data", "required": False, "type": "string"},
-        ],
-    }
+def test_stats_less_manifest_returns_real_data(started_cluster_iceberg_no_spark):
+    """
+    Discriminating reproducer for Altinity#1545.
 
-    data_file_record = {
-        "content":            0,
-        "file_path":          data_container_path,
-        "file_format":        "PARQUET",
-        "partition":          {},
-        "record_count":       3,
-        "file_size_in_bytes": data_size,
-    }
-    if manifest_extra_fields:
-        data_file_record.update(manifest_extra_fields)
+    A stats-less manifest (empty columns_info) must read real data, not all-NULL
+    rows.  Unpatched build: every nullable column is injected as a constant NULL
+    and the result is 3 all-NULL rows.  Fixed build: the empty-stats guard falls
+    through to the Parquet reader and returns the real values.
+    """
+    instance       = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name     = "test_stats_less_real_" + get_uuid_str()
+    container_base = _container_base(table_name)
 
-    manifest_local_path     = os.path.join(meta_local, "00000-0-manifest.avro")
-    manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
-    _write_avro(
-        avro.schema.parse(manifest_schema_str),
-        [{
-            "status":               1,
-            "snapshot_id":          snapshot_id,
-            "sequence_number":      seq_number,
-            "file_sequence_number": seq_number,
-            "data_file":            data_file_record,
-        }],
-        manifest_local_path,
-        metadata={
-            "schema":         json.dumps(iceberg_schema),
-            "partition-spec": "[]",
-        },
-    )
-    manifest_size = os.path.getsize(manifest_local_path)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local_dir = _create_iceberg_table(
+            tmpdir, table_name, container_base,
+            manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+        )
+        _upload_to_node(instance, local_dir, container_base)
 
-    mlist_filename       = f"snap-{snapshot_id}-0-manifest-list.avro"
-    mlist_local_path     = os.path.join(meta_local, mlist_filename)
-    mlist_container_path = f"{container_base}/metadata/{mlist_filename}"
-    _write_avro(
-        avro.schema.parse(_MANIFEST_LIST_SCHEMA_STR),
-        [{
-            "manifest_path":        manifest_container_path,
-            "manifest_length":      manifest_size,
-            "partition_spec_id":    0,
-            "content":              0,
-            "sequence_number":      seq_number,
-            "min_sequence_number":  seq_number,
-            "added_snapshot_id":    snapshot_id,
-            "added_files_count":    1,
-            "existing_files_count": 0,
-            "deleted_files_count":  0,
-            "added_rows_count":     3,
-            "existing_rows_count":  0,
-            "deleted_rows_count":   0,
-        }],
-        mlist_local_path,
+    result = _query_local(instance, container_base)
+    assert result == _REAL_DATA_RESULT, (
+        f"Got: {result!r}\n"
+        "All-NULL rows mean the bug fired: empty columns_info caused the "
+        "optimization to treat every nullable column as absent and inject "
+        "constant NULL (need_only_count=1)."
     )
 
-    metadata = {
-        "format-version":       2,
-        "table-uuid":           str(uuid.uuid4()),
-        "location":             container_base,
-        "last-sequence-number": seq_number,
-        "last-updated-ms":      ts_ms,
-        "last-column-id":       2,
-        "current-schema-id":    0,
-        "schemas": [{
-            "type":      "struct",
-            "schema-id": 0,
-            "fields": [
-                {"id": 1, "name": "id",   "required": False, "type": "int"},
-                {"id": 2, "name": "data", "required": False, "type": "string"},
-            ],
-        }],
-        "default-spec-id":       0,
-        "partition-specs":       [{"spec-id": 0, "fields": []}],
-        "last-partition-id":     999,
-        "default-sort-order-id": 0,
-        "sort-orders":           [{"order-id": 0, "fields": []}],
-        "properties":            {},
-        "current-snapshot-id":   snapshot_id,
-        "snapshots": [{
-            "snapshot-id":    snapshot_id,
-            "sequence-number": seq_number,
-            "timestamp-ms":   ts_ms,
-            "manifest-list":  mlist_container_path,
-            "summary":        {"operation": "append"},
-            "schema-id":      0,
-        }],
-        "snapshot-log": [{"timestamp-ms": ts_ms, "snapshot-id": snapshot_id}],
-        "metadata-log":  [],
-        "refs":          {"main": {"snapshot-id": snapshot_id, "type": "branch"}},
-    }
-    with open(os.path.join(meta_local, "v1.metadata.json"), "w") as f:
-        json.dump(metadata, f, indent=2)
-
-    return table_local
-
 
-def test_iceberg_local_returns_actual_rows_with_stats_less_manifest(
-    started_cluster_iceberg_no_spark,
-):
+def test_stats_less_manifest_schema_evolution_absent_column(started_cluster_iceberg_no_spark):
     """
-    FAILS on unpatched code (Altinity#1545).
+    Discriminating test for the schema-evolution fall-through (highest risk).
+
+    The Iceberg schema declares ``id``+``data`` but the Parquet file was written
+    under the older schema and physically contains only ``id``.  With the
+    stats-less manifest, the fixed build must fall through to the Parquet reader,
+    read ``id`` for real, and synthesize the absent ``data`` column as NULL via
+    IcebergMetadata::getInitialSchemaByPath (file schema as initial_header).
 
-    With allow_experimental_iceberg_read_optimization=1 (the default), reading an
-    icebergLocal table whose manifest omits column statistics returns all-NULL rows
-    instead of real data.  The correct output is 3 rows: (1,'hello'), (2,'world'),
-    (3,'iceberg').
+    Unpatched build: the optimization marks BOTH columns (including ``id``)
+    constant NULL, so ``id`` comes back NULL too -- the discriminator.
     """
-    instance   = started_cluster_iceberg_no_spark.instances["node1"]
-    table_name = "test_opt_on_stats_less_" + get_uuid_str()
-    container_base = (
-        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
-    )
+    instance       = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name     = "test_stats_less_evolve_" + get_uuid_str()
+    container_base = _container_base(table_name)
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
-        _upload_to_container(instance, local_dir, container_base)
-
-    result = instance.query(
-        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
-        " ORDER BY id"
-        " SETTINGS allow_experimental_iceberg_read_optimization=1"
-    ).strip()
+        local_dir = _create_iceberg_table(
+            tmpdir, table_name, container_base,
+            manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+            file_has_data_column=False,
+            schema_evolution=True,
+        )
+        _upload_to_node(instance, local_dir, container_base)
 
-    assert result == "1\thello\n2\tworld\n3\ticeberg", (
+    result = _query_local(instance, container_base)
+    assert result == _DATA_NULL_RESULT, (
         f"Got: {result!r}\n"
-        "All-NULL rows means the bug fired: empty columns_infos caused the "
-        "optimization to treat every nullable column as an absent schema-evolution "
-        "column and inject constant NULL (need_only_count=1)."
+        "Expected real 'id' with NULL 'data'.  If 'id' is also NULL the bug "
+        "fired; if the query errors, the Parquet fall-through did not synthesize "
+        "the schema-evolved-absent column as NULL."
     )
 
 
-def test_iceberg_local_returns_correct_rows_when_optimization_disabled(
-    started_cluster_iceberg_no_spark,
-):
+def test_stats_less_manifest_cluster_returns_real_data(started_cluster_iceberg_no_spark):
     """
-    PASSES on unpatched code.
-
-    With allow_experimental_iceberg_read_optimization=0 the optimization block is
-    skipped entirely and the Parquet file is read normally.  This test is a
-    regression guard: it must keep passing after the bug is fixed.
+    Cluster/serialization-path coverage for Altinity#1545.
+
+    icebergLocalCluster distributes file-level tasks to workers, serializing
+    DataFileMetaInfo (and its empty columns_info) through the cluster JSON path.
+    An empty columns_info must survive the round-trip and still read real data on
+    the worker, exercising the same fix on the *Cluster variants.  The table is
+    uploaded to every node because local storage is read by each worker from its
+    own filesystem.
     """
-    instance   = started_cluster_iceberg_no_spark.instances["node1"]
-    table_name = "test_opt_off_stats_less_" + get_uuid_str()
-    container_base = (
-        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
-    )
+    cluster        = started_cluster_iceberg_no_spark
+    instance       = cluster.instances["node1"]
+    table_name     = "test_stats_less_cluster_" + get_uuid_str()
+    container_base = _container_base(table_name)
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_stats_less_iceberg_table(tmpdir, table_name, container_base)
-        _upload_to_container(instance, local_dir, container_base)
+        local_dir = _create_iceberg_table(
+            tmpdir, table_name, container_base,
+            manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+        )
+        for node in cluster.instances.values():
+            _upload_to_node(node, local_dir, container_base)
 
     result = instance.query(
-        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
+        f"SELECT * FROM icebergLocalCluster('cluster_simple', local, path='{container_base}', format=Parquet)"
         " ORDER BY id"
-        " SETTINGS allow_experimental_iceberg_read_optimization=0"
+        " SETTINGS allow_experimental_iceberg_read_optimization=1"
     ).strip()
-
-    assert result == "1\thello\n2\tworld\n3\ticeberg"
+    assert result == _REAL_DATA_RESULT, (
+        f"Got: {result!r}\n"
+        "All-NULL rows mean empty columns_info was mishandled on the worker after "
+        "the cluster JSON round-trip."
+    )
 
 
-def test_iceberg_local_partial_stats_manifest_reads_correctly(
-    started_cluster_iceberg_no_spark,
-):
+def test_non_empty_stats_absent_column_still_null(started_cluster_iceberg_no_spark):
     """
-    Regression test for Altinity#1545: partial stats (value_counts only).
+    Non-regression guard: the optimization's legitimate absent-NULL path.
 
-    A manifest that includes value_counts but omits column_sizes and
-    null_value_counts still populates columns_info.  The fix must treat the
-    columns as real (not absent), so real data is returned.
+    The Iceberg schema declares ``id``+``data`` but the Parquet file contains
+    only ``id``, and the manifest carries stats for ``id`` only (non-empty
+    columns_info).  The optimization must keep marking the genuinely-absent
+    ``data`` column NULL while ``id`` reads real values.  Passes on both builds;
+    the fix only changes the empty-columns_info case.
     """
-    instance   = started_cluster_iceberg_no_spark.instances["node1"]
-    table_name = "test_opt_on_partial_stats_" + get_uuid_str()
-    container_base = (
-        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
-    )
+    instance       = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name     = "test_nonempty_absent_" + get_uuid_str()
+    container_base = _container_base(table_name)
 
-    # value_counts for field_id 1 (id) and 2 (data): 3 rows each.
-    extra = {
-        "value_counts": [
-            {"key": 1, "value": 3},
-            {"key": 2, "value": 3},
-        ],
-    }
+    # value_counts for field_id 1 (id) only -> columns_info = {id}.
+    stats = {"value_counts": [{"key": 1, "value": len(_IDS)}]}
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_iceberg_table_with_schema(
+        local_dir = _create_iceberg_table(
             tmpdir, table_name, container_base,
-            _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
-            manifest_extra_fields=extra,
+            manifest_entry_schema_str=_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
+            file_has_data_column=False,
+            data_file_stats=stats,
         )
-        _upload_to_container(instance, local_dir, container_base)
+        _upload_to_node(instance, local_dir, container_base)
 
-    result = instance.query(
-        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
-        " ORDER BY id"
-        " SETTINGS allow_experimental_iceberg_read_optimization=1"
-    ).strip()
-
-    assert result == "1\thello\n2\tworld\n3\ticeberg", (
+    result = _query_local(instance, container_base)
+    assert result == _DATA_NULL_RESULT, (
         f"Got: {result!r}\n"
-        "Partial-stats manifest (value_counts only) should not trigger the absent-NULL "
-        "path because columns_info is non-empty.  All-NULL rows means the guard "
-        "fired incorrectly for partial-stats manifests."
+        "With non-empty stats the optimization must still read 'id' and inject "
+        "the absent 'data' column as NULL."
     )
 
 
-def test_iceberg_local_full_stats_manifest_reads_correctly(
-    started_cluster_iceberg_no_spark,
+@pytest.mark.parametrize(
+    "manifest_entry_schema_str,stats",
+    [
+        (
+            _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
+            {"value_counts": [{"key": 1, "value": 3}, {"key": 2, "value": 3}]},
+        ),
+        (
+            _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
+            {
+                "column_sizes":      [{"key": 1, "value": 64}, {"key": 2, "value": 128}],
+                "value_counts":      [{"key": 1, "value": 3}, {"key": 2, "value": 3}],
+                "null_value_counts": [{"key": 1, "value": 0}, {"key": 2, "value": 0}],
+            },
+        ),
+    ],
+    ids=["partial_stats", "full_stats"],
+)
+def test_non_empty_stats_returns_real_data(
+    started_cluster_iceberg_no_spark, manifest_entry_schema_str, stats
 ):
     """
-    Control test: full-stats manifest (Spark-like) must continue to return real data.
-
-    A manifest with all three stats fields (column_sizes, value_counts,
-    null_value_counts) is the common case written by Spark.  This test verifies the
-    optimization does not regress for the normal path after the fix.
+    Non-regression guard: manifests that do populate columns_info (partial
+    value_counts-only, and full Spark-style stats) must keep returning real data
+    when the optimization is enabled.  These are the cases the empty-stats guard
+    must not disturb.
     """
-    instance   = started_cluster_iceberg_no_spark.instances["node1"]
-    table_name = "test_opt_on_full_stats_" + get_uuid_str()
-    container_base = (
-        f"/var/lib/clickhouse/user_files/iceberg_data/default/{table_name}"
-    )
-
-    # Full stats for field_id 1 (id) and 2 (data).
-    extra = {
-        "column_sizes": [
-            {"key": 1, "value": 64},
-            {"key": 2, "value": 128},
-        ],
-        "value_counts": [
-            {"key": 1, "value": 3},
-            {"key": 2, "value": 3},
-        ],
-        "null_value_counts": [
-            {"key": 1, "value": 0},
-            {"key": 2, "value": 0},
-        ],
-    }
+    instance       = started_cluster_iceberg_no_spark.instances["node1"]
+    table_name     = "test_nonempty_real_" + get_uuid_str()
+    container_base = _container_base(table_name)
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_iceberg_table_with_schema(
+        local_dir = _create_iceberg_table(
             tmpdir, table_name, container_base,
-            _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
-            manifest_extra_fields=extra,
+            manifest_entry_schema_str=manifest_entry_schema_str,
+            data_file_stats=stats,
         )
-        _upload_to_container(instance, local_dir, container_base)
-
-    result = instance.query(
-        f"SELECT * FROM icebergLocal(local, path='{container_base}', format=Parquet)"
-        " ORDER BY id"
-        " SETTINGS allow_experimental_iceberg_read_optimization=1"
-    ).strip()
+        _upload_to_node(instance, local_dir, container_base)
 
-    assert result == "1\thello\n2\tworld\n3\ticeberg", (
+    result = _query_local(instance, container_base)
+    assert result == _REAL_DATA_RESULT, (
         f"Got: {result!r}\n"
-        "Full-stats manifest should return real data when optimization is enabled."
+        "A manifest with non-empty stats must return real data when the "
+        "optimization is enabled."
     )

From 17ef6e71a7bed981355158186229316af81e1361 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 11 May 2026 16:37:40 -0300
Subject: [PATCH 03/33] cast like insert select

---
 src/Storages/MergeTree/ExportPartTask.cpp     |  29 ++
 src/Storages/MergeTree/MergeTreeData.cpp      |  23 +-
 src/Storages/StorageReplicatedMergeTree.cpp   |  23 +-
 .../test.py                                   | 263 ++++++++++++++++++
 ...rge_tree_part_to_object_storage_simple.sql |   9 +-
 ...rge_tree_part_to_object_storage_simple.sql |   4 +-
 6 files changed, 337 insertions(+), 14 deletions(-)

diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp
index 94b25f63ad9f..f2035577bbf6 100644
--- a/src/Storages/MergeTree/ExportPartTask.cpp
+++ b/src/Storages/MergeTree/ExportPartTask.cpp
@@ -99,6 +99,31 @@ namespace
         }
     }
 
+    /// Mirrors `InterpreterInsertQuery::addInsertToSelectPipeline`: positional match,
+    /// destination header = `getSampleBlockNonMaterialized()`, all type bridging is done
+    /// by the CAST inside `makeConvertingActions`. No pre-validation, no per-column
+    /// lossy/non-lossy classification — restrictions are exactly what INSERT SELECT enforces.
+    void addExportConvertingActions(
+        QueryPlan & plan_for_part,
+        const IStorage & destination_storage,
+        const ContextPtr & local_context)
+    {
+        const auto destination_header
+            = destination_storage.getInMemoryMetadataPtr()->getSampleBlockNonMaterialized();
+
+        auto dag = ActionsDAG::makeConvertingActions(
+            plan_for_part.getCurrentHeader()->getColumnsWithTypeAndName(),
+            destination_header.getColumnsWithTypeAndName(),
+            ActionsDAG::MatchColumnsMode::Position,
+            local_context);
+
+        auto expression_step = std::make_unique<ExpressionStep>(
+            plan_for_part.getCurrentHeader(),
+            std::move(dag));
+        expression_step->setStepDescription("Convert source columns to destination types for export");
+        plan_for_part.addStep(std::move(expression_step));
+    }
+
     String buildDestinationFilename(
         const MergeTreePartExportManifest & manifest,
         const StorageID & storage_id,
@@ -261,6 +286,10 @@ bool ExportPartTask::executeStep()
         /// This is a hack that materializes the columns before the export so they can be exported to tables that have matching columns
         materializeSpecialColumns(plan_for_part.getCurrentHeader(), metadata_snapshot, local_context, plan_for_part);
 
+        /// Align the pipeline header with the destination's non-materialized sample block,
+        /// using the same `makeConvertingActions(Position)` call INSERT SELECT performs.
+        addExportConvertingActions(plan_for_part, *destination_storage, local_context);
+
         QueryPlanOptimizationSettings optimization_settings(local_context);
         auto pipeline_settings = BuildQueryPipelineSettings(local_context);
         auto builder = plan_for_part.buildQueryPipeline(optimization_settings, pipeline_settings);
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index a07bd992066e..9dfb4edf20bc 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -30,6 +30,7 @@
 #include <Common/thread_local_rng.h>
 #include <Storages/MergeTree/ExportPartTask.h>
 #include <Storages/MergeTree/ExportPartitionUtils.h>
+#include <Interpreters/ActionsDAG.h>
 #include <Processors/Executors/CompletedPipelineExecutor.h>
 #include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
 #include <Storages/MergeTree/MergeTreeSequentialSource.h>
@@ -6750,14 +6751,24 @@ void MergeTreeData::exportPartToTable(
 #endif
     }
 
-    const auto & source_columns = source_metadata_ptr->getColumns();
+    /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
+    /// positional matching with CAST. We surface structural mismatches synchronously at
+    /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
+    /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
+    /// untyped cast) fails here; anything it accepts will be CAST at runtime.
+    {
+        Block source_sample_block;
+        for (const auto & column : source_metadata_ptr->getColumns().getReadable())
+            source_sample_block.insert({column.type->createColumn(), column.type, column.name});
 
-    const auto & destination_columns = destination_metadata_ptr->getColumns();
+        const auto destination_sample_block = destination_metadata_ptr->getSampleBlockNonMaterialized();
 
-    /// compare all source readable columns with all destination insertable columns
-    /// this allows us to skip ephemeral columns
-    if (source_columns.getReadable().sizeOfDifference(destination_columns.getInsertable()))
-        throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure");
+        (void) ActionsDAG::makeConvertingActions(
+            source_sample_block.getColumnsWithTypeAndName(),
+            destination_sample_block.getColumnsWithTypeAndName(),
+            ActionsDAG::MatchColumnsMode::Position,
+            query_context);
+    }
 
     /// for data lakes this check is performed differently. It is a bit more complex as we need to convert the iceberg partition spec
     /// to the MergeTree partition spec and compare the two.
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 7aaecac34832..9104884a2a91 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -79,6 +79,7 @@
 #include <Storages/MergeTree/PatchParts/PatchPartsLock.h>
 #include <Storages/MergeTree/PatchParts/PatchPartsUtils.h>
 #include <Storages/MergeTree/ExportPartitionUtils.h>
+#include <Interpreters/ActionsDAG.h>
 
 #include <Databases/DatabaseOnDisk.h>
 #include <Databases/DatabaseReplicated.h>
@@ -8387,10 +8388,24 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
     auto src_snapshot = getInMemoryMetadataPtr();
     auto destination_snapshot = dest_storage->getInMemoryMetadataPtr();
 
-    /// compare all source readable columns with all destination insertable columns
-    /// this allows us to skip ephemeral columns
-    if (src_snapshot->getColumns().getReadable().sizeOfDifference(destination_snapshot->getColumns().getInsertable()))
-        throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure");
+    /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
+    /// positional matching with CAST. We surface structural mismatches synchronously at
+    /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
+    /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
+    /// untyped cast) fails here; anything it accepts will be CAST at runtime.
+    {
+        Block source_sample_block;
+        for (const auto & column : src_snapshot->getColumns().getReadable())
+            source_sample_block.insert({column.type->createColumn(), column.type, column.name});
+
+        const auto destination_sample_block = destination_snapshot->getSampleBlockNonMaterialized();
+
+        (void) ActionsDAG::makeConvertingActions(
+            source_sample_block.getColumnsWithTypeAndName(),
+            destination_sample_block.getColumnsWithTypeAndName(),
+            ActionsDAG::MatchColumnsMode::Position,
+            query_context);
+    }
 
     /// for data lakes this check is performed later. It is a bit more complex as we need to convert the iceberg partition spec
     /// to the MergeTree partition spec and compare the two.
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index b371727a08cd..9dc3b2f8434e 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -121,6 +121,42 @@ def assert_part_log(node, table: str, part: str) -> None:
     )
 
 
+def wait_for_failed_export_part(
+    node,
+    table: str,
+    part: str,
+    timeout: int = 60,
+    poll_interval: float = 0.5,
+) -> str:
+    """Poll system.part_log until a failed ExportPart event appears for *part*.
+
+    Returns the exception text recorded on the log entry, which lets callers
+    assert on the specific runtime error that propagated from the export worker.
+    """
+    deadline = time.time() + timeout
+    last_seen = ""
+    while time.time() < deadline:
+        node.query("SYSTEM FLUSH LOGS")
+        row = node.query(
+            f"SELECT error, exception FROM system.part_log "
+            f"WHERE event_type = 'ExportPart' "
+            f"AND database = currentDatabase() "
+            f"AND table = '{table}' "
+            f"AND part_name = '{part}' "
+            f"AND error != 0 "
+            f"ORDER BY event_time DESC LIMIT 1"
+        ).strip()
+        if row:
+            _error, exception = row.split("\t", 1)
+            return exception
+        last_seen = row
+        time.sleep(poll_interval)
+    raise TimeoutError(
+        f"Failed ExportPart event for part {part!r} in table {table!r} "
+        f"did not appear in system.part_log within {timeout}s (last row: {last_seen!r})"
+    )
+
+
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
@@ -479,3 +515,230 @@ def test_export_part_writes_column_statistics(cluster):
 
     node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
     node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+# ---------------------------------------------------------------------------
+# Schema compatibility (INSERT SELECT mirror)
+#
+# EXPORT PART builds an ActionsDAG via makeConvertingActions(Position) — the
+# same primitive that powers INSERT INTO dest SELECT * FROM src.  The tests
+# below pin that contract:
+#
+#   * Column counts must match positionally; otherwise the ALTER is rejected
+#     synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands.
+#   * Destination column names need not match source names — positional pairs
+#     are CAST element-wise.
+#   * Castable type differences are accepted at validation time (widening and
+#     even lossy narrowing).  Lossiness only matters at runtime: if the value
+#     does not fit the destination type, the async export worker fails and the
+#     failure surfaces via system.part_log; the Iceberg table is untouched.
+# ---------------------------------------------------------------------------
+
+
+def test_export_part_column_count_mismatch_source_more_is_rejected(cluster):
+    """
+    Source has 3 columns (id, year, extra), destination has 2 (id, year).
+    The ALTER must be rejected synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH
+    and the Iceberg table must remain empty.
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_count_more_{sfx}"
+    iceberg = f"iceberg_count_more_{sfx}"
+
+    make_mt(node, mt, "id Int32, year Int32, extra String", "year")
+    make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES (1, 2020, 'foo'), (2, 2020, 'bar')")
+    part_2020 = get_part(node, mt, "2020")
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
+        f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
+        f"allow_experimental_insert_into_iceberg = 1"
+    )
+    assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+        f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source>dest column count, "
+        f"got: {error!r}"
+    )
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+def test_export_part_column_count_mismatch_source_fewer_is_rejected(cluster):
+    """
+    Source has 2 columns (id, year), destination has 3 (id, year, extra).
+    Same expected synchronous rejection as the source>dest case.
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_count_fewer_{sfx}"
+    iceberg = f"iceberg_count_fewer_{sfx}"
+
+    make_mt(node, mt, "id Int32, year Int32", "year")
+    make_iceberg_s3(node, iceberg, "id Int32, year Int32, extra String", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)")
+    part_2020 = get_part(node, mt, "2020")
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
+        f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
+        f"allow_experimental_insert_into_iceberg = 1"
+    )
+    assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+        f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source<dest column count, "
+        f"got: {error!r}"
+    )
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+def test_export_part_with_renamed_destination_column(cluster):
+    """
+    Source has column `id`, destination has the same shape but the column is
+    named `renamed_id`.  Positional matching must accept the export and the
+    data must land in the destination under the new name.
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_renamed_{sfx}"
+    iceberg = f"iceberg_renamed_{sfx}"
+
+    make_mt(node, mt, "id Int32, year Int32", "year")
+    make_iceberg_s3(node, iceberg, "renamed_id Int32, year Int32", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)")
+    part_2020 = get_part(node, mt, "2020")
+
+    export_part(node, mt, part_2020, iceberg)
+    wait_for_export_part(node, mt, part_2020)
+
+    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+    assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT renamed_id, year FROM {iceberg} ORDER BY renamed_id"
+    ).strip()
+    assert result == "1\t2020\n2\t2020\n3\t2020", (
+        f"Unexpected data under renamed column:\n{result}"
+    )
+
+    assert_part_log(node, mt, part_2020)
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+def test_export_part_with_castable_widening(cluster):
+    """
+    Source column is Int32, destination expects Int64.  makeConvertingActions
+    inserts a widening CAST; the export must succeed and the values must land
+    as Int64 in Iceberg.
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_widen_{sfx}"
+    iceberg = f"iceberg_widen_{sfx}"
+
+    make_mt(node, mt, "id Int32, year Int32", "year")
+    make_iceberg_s3(node, iceberg, "id Int64, year Int32", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)")
+    part_2020 = get_part(node, mt, "2020")
+
+    export_part(node, mt, part_2020, iceberg)
+    wait_for_export_part(node, mt, part_2020)
+
+    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+    assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT id, toTypeName(id), year FROM {iceberg} ORDER BY id"
+    ).strip()
+    assert result == "1\tInt64\t2020\n2\tInt64\t2020", (
+        f"Unexpected widened data:\n{result}"
+    )
+
+    assert_part_log(node, mt, part_2020)
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+def test_export_part_with_castable_narrowing_values_fit(cluster):
+    """
+    Source column is Int64, destination expects Int32 — lossy in general but
+    not blocked, mirroring INSERT SELECT.
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_narrow_fit_{sfx}"
+    iceberg = f"iceberg_narrow_fit_{sfx}"
+
+    make_mt(node, mt, "id Int64, year Int32", "year")
+    make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)")
+    part_2020 = get_part(node, mt, "2020")
+
+    export_part(node, mt, part_2020, iceberg)
+    wait_for_export_part(node, mt, part_2020)
+
+    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+    assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT id, toTypeName(id), year FROM {iceberg} ORDER BY id"
+    ).strip()
+    assert result == "1\tInt32\t2020\n2\tInt32\t2020", (
+        f"Unexpected narrowed data:\n{result}"
+    )
+
+    assert_part_log(node, mt, part_2020)
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
+def test_export_part_runtime_cast_failure_propagates_async(cluster):
+    """
+    Source has a String column whose value cannot be parsed as the destination
+    Int32 column.  Validation accepts the type pair (String -> Int32 is a
+    defined conversion) so the ALTER returns synchronously; the actual cast
+    runs in the async export worker and must fail there.  The failure is
+    propagated to system.part_log (non-zero `error` column with a non-empty
+    `exception` text) and the Iceberg table is left empty.
+
+    (Integer narrowing overflow is not used here because the internal cast
+    `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
+    on overflow rather than throwing — that would still produce a successful
+    export with garbage data, not a propagated failure.)
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_runtime_cast_fail_{sfx}"
+    iceberg = f"iceberg_runtime_cast_fail_{sfx}"
+
+    make_mt(node, mt, "id String, year Int32", "year")
+    make_iceberg_s3(node, iceberg, "id Int32, year Int32", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES ('not a number', 2020)")
+    part_2020 = get_part(node, mt, "2020")
+
+    export_part(node, mt, part_2020, iceberg)
+
+    exception = wait_for_failed_export_part(node, mt, part_2020)
+    assert exception, (
+        f"Expected non-empty exception text on failed ExportPart entry, got {exception!r}"
+    )
+
+    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+    assert count == 0, (
+        f"Expected 0 rows in Iceberg table after failed export, got {count}"
+    )
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index 7cb70af024a2..ceb83191f93a 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -9,10 +9,12 @@ CREATE TABLE 03572_mt_table (id UInt64, year UInt16) ENGINE = MergeTree() PARTIT
 INSERT INTO 03572_mt_table VALUES (1, 2020);
 
 -- Create a table with a different partition key and export a partition to it. It should throw
+-- on the partition-key AST mismatch (schema compat now follows INSERT SELECT positional semantics,
+-- so the column shape matches and the partition-key check is what fires).
 CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x;
 
 ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_invalid_schema_table
-SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS}
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
 DROP TABLE 03572_invalid_schema_table;
 
@@ -27,13 +29,14 @@ ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE FUNCTION extractKey
 -- It is a table function, but the engine does not support exports/imports, should throw
 ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE FUNCTION url('a.parquet'); -- {serverError NOT_IMPLEMENTED}
 
--- Test that destination table can not have a column that matches the source ephemeral
+-- Source-side ephemeral columns are not readable, so the destination must not declare a matching
+-- ordinary column or the column count will not align under positional matching.
 CREATE TABLE 03572_ephemeral_mt_table (id UInt64, year UInt16, name String EPHEMERAL) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
 
 CREATE TABLE 03572_matching_ephemeral_s3_table (id UInt64, year UInt16, name String) ENGINE = S3(s3_conn, filename='03572_matching_ephemeral_s3_table', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
 INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice');
 
-ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError INCOMPATIBLE_COLUMNS}
+ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH}
 
 DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index f8f23532f0a7..0f6647861e43 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -7,10 +7,12 @@ CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTr
 INSERT INTO 03572_rmt_table VALUES (1, 2020);
 
 -- Create a table with a different partition key and export a partition to it. It should throw
+-- on the partition-key AST mismatch (schema compat now follows INSERT SELECT positional semantics,
+-- so the column shape matches and the partition-key check is what fires).
 CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x;
 
 ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table
-SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS}
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
 DROP TABLE 03572_invalid_schema_table;
 

From c88f69eb66f5cb5e794995d40e140c903d94da0a Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Fri, 22 May 2026 18:51:08 -0300
Subject: [PATCH 04/33] tmp

---
 .../MergeTree/ExportPartitionUtils.cpp        |  60 +++++++++-
 src/Storages/MergeTree/ExportPartitionUtils.h |  35 +++++-
 src/Storages/MergeTree/MergeTreeData.cpp      |  22 +++-
 src/Storages/StorageReplicatedMergeTree.cpp   |  22 +++-
 .../test.py                                   |  67 +++++++++++
 .../test.py                                   |  66 ++++++++++
 .../test.py                                   | 113 ++++++++++++++++++
 ...rge_tree_part_to_object_storage_simple.sql |  16 ++-
 ...rge_tree_part_to_object_storage_simple.sql |  14 ++-
 9 files changed, 400 insertions(+), 15 deletions(-)

diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index 4a447c8de3ab..d5df9b6bea2a 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -403,8 +403,18 @@ namespace ExportPartitionUtils
 #if USE_AVRO
     void verifyIcebergPartitionCompatibility(
         const Poco::JSON::Object::Ptr & metadata_object,
-        const ASTPtr & partition_key_ast)
+        const ASTPtr & partition_key_ast,
+        const StorageMetadataPtr & source_metadata,
+        const StorageMetadataPtr & destination_metadata,
+        const StorageID & destination_storage_id)
     {
+        /// The Iceberg manifest's partition values are derived from the SOURCE
+        /// part's minmax block (see `IcebergMetadata::commitExportPartitionTransaction`)
+        /// while the row data is CAST to the destination Iceberg schema by
+        /// `addExportConvertingActions`. Reject castable type differences on
+        /// partition-key columns up front so the two cannot disagree.
+        verifyPartitionColumnsExactTypeMatch(source_metadata, destination_metadata, destination_storage_id);
+
         const auto original_schema_id = metadata_object->getValue<Int64>(Iceberg::f_current_schema_id);
         const auto partition_spec_id  = metadata_object->getValue<Int64>(Iceberg::f_default_spec_id);
 
@@ -519,6 +529,54 @@ namespace ExportPartitionUtils
         }
     }
 #endif
+
+    void verifyPartitionColumnsExactTypeMatch(
+        const StorageMetadataPtr & source_metadata,
+        const StorageMetadataPtr & destination_metadata,
+        const StorageID & destination_storage_id)
+    {
+        if (!source_metadata->hasPartitionKey())
+            return;
+
+        /// Use the partition expression's required columns (the ones that flow
+        /// into the destination's partition strategy via
+        /// `minmax_idx->getBlock(storage)`), not the partition expression's
+        /// sample-block output columns.  This catches both identity
+        /// partitioning (`PARTITION BY year`) and computed partitioning
+        /// (`PARTITION BY toYearNumSinceEpoch(event_date)` — required column
+        /// is `event_date`).
+        const auto required_columns
+            = source_metadata->getPartitionKey().expression->getRequiredColumnsWithTypes();
+
+        const auto & destination_columns = destination_metadata->getColumns();
+
+        for (const auto & source_column : required_columns)
+        {
+            if (!destination_columns.has(source_column.name))
+                throw Exception(ErrorCodes::BAD_ARGUMENTS,
+                    "Cannot export partition: source partition key references column '{}' "
+                    "which does not exist on the destination table {}. "
+                    "EXPORT does not re-evaluate the partition key against the destination's schema, "
+                    "so every column the source partition expression depends on must be present "
+                    "on the destination with the same type.",
+                    source_column.name, destination_storage_id.getFullTableName());
+
+            const auto destination_type
+                = destination_columns.getPhysical(source_column.name).type;
+            if (!source_column.type->equals(*destination_type))
+                throw Exception(ErrorCodes::BAD_ARGUMENTS,
+                    "Cannot export partition: partition key column '{}' has type {} on the source "
+                    "but {} on the destination table {}. "
+                    "EXPORT does not re-evaluate the partition key against the destination's schema, "
+                    "so castable type differences are not allowed for partition columns "
+                    "(the partition path and any Iceberg partition values are derived from the "
+                    "source-typed minmax block, while the row data is CAST to the destination type).",
+                    source_column.name,
+                    source_column.type->getName(),
+                    destination_type->getName(),
+                    destination_storage_id.getFullTableName());
+        }
+    }
 }
 
 }
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h
index eb67d288d71e..958a8a819457 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.h
+++ b/src/Storages/MergeTree/ExportPartitionUtils.h
@@ -7,6 +7,7 @@
 #include <Common/Logger.h>
 #include <Common/ZooKeeper/ZooKeeper.h>
 #include "Storages/IStorage.h"
+#include <Storages/StorageInMemoryMetadata.h>
 #include <config.h>
 
 #if USE_AVRO
@@ -88,13 +89,41 @@ namespace ExportPartitionUtils
         const std::string & exception_message,
         const LoggerPtr & log);
 
+    /// EXPORT PART/PARTITION never re-evaluates the partition key against the
+    /// destination's schema: the partition path (object storage) and the
+    /// Iceberg partition values (data lake) are derived from the SOURCE part's
+    /// minmax block in ExportPartTask::executeStep. Therefore every column the
+    /// SOURCE partition expression depends on must have an identically-typed
+    /// counterpart on the destination — castable but non-equal types are not
+    /// safe here even though `makeConvertingActions` would accept them.
+    ///
+    /// Called directly from the non-data-lake branch of
+    /// `MergeTreeData::exportPartToTable` /
+    /// `StorageReplicatedMergeTree::exportPartitionToTable`, and indirectly
+    /// for the Iceberg branch via `verifyIcebergPartitionCompatibility` which
+    /// embeds this check.  Throws BAD_ARGUMENTS on missing column or type
+    /// mismatch.
+    void verifyPartitionColumnsExactTypeMatch(
+        const StorageMetadataPtr & source_metadata,
+        const StorageMetadataPtr & destination_metadata,
+        const StorageID & destination_storage_id);
+
 #if USE_AVRO
     /// Verifies that the source MergeTree partition key is compatible with the
-    /// destination Iceberg partition spec by comparing field source-ids and
-    /// transforms in order. Throws BAD_ARGUMENTS if they do not match.
+    /// destination Iceberg partition spec by:
+    ///   * Comparing partition-field source-ids and transforms in order
+    ///     against the Iceberg spec derived from the MergeTree PARTITION BY AST.
+    ///   * Embedding `verifyPartitionColumnsExactTypeMatch` so that columns
+    ///     flowing through the partition key have identical ClickHouse types on
+    ///     both sides — the Iceberg manifest's partition values come from the
+    ///     SOURCE-typed minmax block and cannot be reconciled later.
+    /// Throws BAD_ARGUMENTS on any mismatch.
     void verifyIcebergPartitionCompatibility(
         const Poco::JSON::Object::Ptr & metadata_object,
-        const ASTPtr & partition_key_ast);
+        const ASTPtr & partition_key_ast,
+        const StorageMetadataPtr & source_metadata,
+        const StorageMetadataPtr & destination_metadata,
+        const StorageID & destination_storage_id);
 #endif
 }
 
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 9dfb4edf20bc..ee2b7d2250c8 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6743,7 +6743,12 @@ void MergeTreeData::exportPartToTable(
             metadata_object->stringify(oss);
             iceberg_metadata_json = oss.str();
 
-            ExportPartitionUtils::verifyIcebergPartitionCompatibility(metadata_object, source_metadata_ptr->getPartitionKeyAST());
+            ExportPartitionUtils::verifyIcebergPartitionCompatibility(
+                metadata_object,
+                source_metadata_ptr->getPartitionKeyAST(),
+                source_metadata_ptr,
+                destination_metadata_ptr,
+                dest_storage->getStorageID());
         }
 #else
         (void)iceberg_metadata_json_;
@@ -6770,12 +6775,23 @@ void MergeTreeData::exportPartToTable(
             query_context);
     }
 
-    /// for data lakes this check is performed differently. It is a bit more complex as we need to convert the iceberg partition spec
-    /// to the MergeTree partition spec and compare the two.
+    /// For data lakes (Iceberg) the partition-key checks — both spec/transform
+    /// compatibility and ClickHouse partition-column exact-type equality — are
+    /// performed inside `verifyIcebergPartitionCompatibility` above.
+    /// For non-data-lake destinations we do the AST equivalence check here and
+    /// then run the partition-column exact-type check directly.
     if (!dest_storage->isDataLake())
     {
         if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST()))
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
+
+        /// EXPORT does not re-evaluate the partition key against the destination's schema:
+        /// the hive partition path is derived from the SOURCE part's minmax block in
+        /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
+        /// by `addExportConvertingActions`. Reject castable type differences on
+        /// partition-key columns synchronously so the path and the row data cannot disagree.
+        ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
+            source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID());
     }
 
     auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated});
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 9104884a2a91..cb576f137546 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -8407,14 +8407,24 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
             query_context);
     }
 
-    /// for data lakes this check is performed later. It is a bit more complex as we need to convert the iceberg partition spec
-    /// to the MergeTree partition spec and compare the two.
+    /// For data lakes (Iceberg) the partition-key checks — both spec/transform
+    /// compatibility and ClickHouse partition-column exact-type equality — are
+    /// performed inside `verifyIcebergPartitionCompatibility` below (per partition).
+    /// For non-data-lake destinations we do the AST equivalence check here and
+    /// then run the partition-column exact-type check directly.
     if (!dest_storage->isDataLake())
     {
         if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST()))
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
+
+        /// EXPORT does not re-evaluate the partition key against the destination's schema:
+        /// the hive partition path is derived from the SOURCE part's minmax block in
+        /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
+        /// by `addExportConvertingActions`. Reject castable type differences on
+        /// partition-key columns synchronously so the path and the row data cannot disagree.
+        ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
+            src_snapshot, destination_snapshot, dest_storage->getStorageID());
     }
-    
 
     zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly();
 
@@ -8586,7 +8596,11 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
         const auto metadata_object = iceberg_metadata->getMetadataJSON(query_context);
 
         ExportPartitionUtils::verifyIcebergPartitionCompatibility(
-            metadata_object, src_snapshot->getPartitionKeyAST());
+            metadata_object,
+            src_snapshot->getPartitionKeyAST(),
+            src_snapshot,
+            destination_snapshot,
+            dest_storage->getStorageID());
 
         std::ostringstream oss;     // STYLE_CHECK_ALLOW_STD_STRING_STREAM
         oss.exceptions(std::ios::failbit);
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index 9dc3b2f8434e..43ba24116526 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -385,6 +385,73 @@ def test_export_part_with_year_transform_partition(cluster):
     node.query(f"DROP TABLE IF EXISTS {iceberg}")
 
 
+def test_export_part_partition_column_type_mismatch_is_rejected(cluster):
+    """
+    EXPORT PART must synchronously reject (BAD_ARGUMENTS) when a partition
+    column has different (even castable) types on source and destination.
+
+    The partition path / Iceberg partition values are computed from the
+    SOURCE part's minmax block in ExportPartTask::executeStep before
+    addExportConvertingActions casts the data, so a castable mismatch on
+    a partition column would let the path/value computation and the row
+    data disagree. The check in
+    ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch rejects this
+    at ALTER time.
+
+    Failing case: source `year Int32` -> destination `year Int64`, both
+    PARTITION BY year (identity transform). Both Int32 and Int64 are valid
+    Iceberg primitives, so verifyIcebergPartitionCompatibility happily
+    accepts the spec — only the exact-type check rejects.
+    """
+    node = cluster.instances["node1"]
+    sfx = unique_suffix()
+    mt = f"mt_pcol_type_mismatch_{sfx}"
+    iceberg = f"iceberg_pcol_type_mismatch_{sfx}"
+
+    make_mt(node, mt, "id Int32, year Int32", "year")
+    make_iceberg_s3(node, iceberg, "id Int32, year Int64", "year")
+
+    node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)")
+
+    part_2020 = get_part(node, mt, "2020")
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
+        f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
+        f"allow_experimental_insert_into_iceberg = 1"
+    )
+    assert "BAD_ARGUMENTS" in error, (
+        f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+        f"got: {error!r}"
+    )
+    assert "partition key column 'year'" in error, (
+        f"Expected the error message to identify the offending partition "
+        f"column 'year', got: {error!r}"
+    )
+
+    # No async work scheduled: no ExportPart row in system.part_log.
+    node.query("SYSTEM FLUSH LOGS")
+    part_log_count = node.query(
+        f"SELECT count() FROM system.part_log "
+        f"WHERE event_type = 'ExportPart' "
+        f"  AND database = currentDatabase() "
+        f"  AND table = '{mt}' "
+        f"  AND part_name = '{part_2020}'"
+    ).strip()
+    assert part_log_count == "0", (
+        f"Expected no ExportPart entries in system.part_log after a "
+        f"synchronously-rejected export, found {part_log_count}."
+    )
+
+    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+    assert count == 0, (
+        f"Expected 0 rows in Iceberg table after rejected export, got {count}"
+    )
+
+    node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
+    node.query(f"DROP TABLE IF EXISTS {iceberg}")
+
+
 def test_export_part_partition_key_mismatch_is_rejected(cluster):
     """
     EXPORT PART must synchronously reject (BAD_ARGUMENTS) when the source
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 8b49589a3005..e6e5127bdd72 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,6 +746,72 @@ def test_partition_key_compatibility_check(cluster):
     )
 
 
+def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
+    """
+    EXPORT PARTITION must synchronously reject (BAD_ARGUMENTS) when a
+    partition column has a different (even castable) type on the source
+    MergeTree and the destination Iceberg table.
+
+    The Iceberg destination's partition path is derived from the SOURCE
+    part's minmax block in ExportPartTask::executeStep, and the Iceberg
+    manifest's partition values come from the SOURCE-typed
+    `manifest.data_part->partition.value` Fields. Letting a castable type
+    mismatch through would mean the manifest/path computation and the row
+    data (which IS cast by addExportConvertingActions) disagree — not
+    equivalent to INSERT SELECT.
+
+    Failing case: source `year Int32` -> destination `year Int64`, both
+    PARTITION BY year (identity transform). Both are valid Iceberg primitives
+    so verifyIcebergPartitionCompatibility accepts the spec; only the new
+    exact-type check rejects.
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_pcol_type_mismatch_{uid}"
+    iceberg_table = f"iceberg_pcol_type_mismatch_{uid}"
+
+    # Source uses year Int32; destination Iceberg uses year Int64.
+    make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)")
+
+    make_iceberg_s3(
+        node, iceberg_table, "id Int64, year Int64",
+        partition_by="year", s3_retry_attempts=3,
+    )
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}"
+    )
+    assert "BAD_ARGUMENTS" in error, (
+        f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+        f"got: {error!r}"
+    )
+    assert "partition key column 'year'" in error, (
+        f"Expected the error message to identify the offending partition "
+        f"column 'year', got: {error!r}"
+    )
+
+    # No async work scheduled.
+    rows_in_system_view = node.query(
+        f"SELECT count() FROM system.replicated_partition_exports "
+        f"WHERE source_table = '{mt_table}' "
+        f"  AND destination_table = '{iceberg_table}' "
+        f"  AND partition_id = '2020'"
+    ).strip()
+    assert rows_in_system_view == "0", (
+        f"Expected no row in system.replicated_partition_exports after a "
+        f"synchronously-rejected export, got {rows_in_system_view}."
+    )
+
+    # Destination must remain empty.
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 0, (
+        f"Expected 0 rows in Iceberg destination after rejected export, "
+        f"got {count}"
+    )
+
+
 def test_export_ttl(cluster):
     """
     After a manifest TTL expires the same partition can be re-exported, and the
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index 3161e3b67100..5c3e656107bc 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -1532,6 +1532,119 @@ def test_export_partition_all(cluster):
     assert row_count == 3, f"Expected 3 rows in S3 after EXPORT PARTITION ALL, got {row_count}"
 
 
+def test_export_partition_partition_column_castable_type_mismatch(cluster):
+    """
+    Post-fix contract: EXPORT PARTITION must reject castable but non-equal
+    partition-column types synchronously.
+
+    Setup
+    -----
+        * Source (ReplicatedMergeTree): ``id UInt64, year String``,
+          ``PARTITION BY year``
+        * Destination (S3, partition_strategy='hive'): ``id UInt64,
+          year UInt16``, ``PARTITION BY year``
+
+    Why the synchronous rejection is required
+    -----------------------------------------
+    ``makeConvertingActions(Position)`` accepts ``String -> UInt16`` and the
+    PARTITION BY ASTs are textually identical, so the schema/AST validators
+    would let this through.  But EXPORT does not re-evaluate the partition
+    key against the destination's schema: in ``ExportPartTask::executeStep``
+    the block fed to ``destination_storage->import`` is built from the
+    source part's minmax index (SOURCE types).  If we let the export proceed
+    the hive partition path is computed from the un-cast SOURCE value while
+    the row data is the CAST value — silent divergence from INSERT SELECT.
+    The exact-type partition-column check in
+    ``ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch`` rejects
+    this pairing at ALTER time.
+
+    Assertions
+    ----------
+        * ``ALTER ... EXPORT PARTITION ID '<id>' TO TABLE ...`` raises a
+          ``BAD_ARGUMENTS`` exception whose text references the partition
+          column type mismatch.
+        * No row is created in ``system.replicated_partition_exports`` for
+          this (source_table, destination_table, partition_id) — nothing
+          was ever scheduled.
+        * No parquet file is written under any ``year=*/`` prefix in S3.
+    """
+    skip_if_remote_database_disk_enabled(cluster)
+    node = cluster.instances["replica1"]
+
+    postfix = str(uuid.uuid4()).replace("-", "_")
+    mt_table = f"pkey_cast_mismatch_partition_mt_{postfix}"
+    s3_table = f"pkey_cast_mismatch_partition_s3_{postfix}"
+
+    # Source: year String; destination: year UInt16. PARTITION BY year on
+    # both sides — same AST text — to defeat the AST equivalence check.
+    node.query(
+        f"CREATE TABLE {mt_table} (id UInt64, year String) "
+        f"ENGINE = ReplicatedMergeTree('/clickhouse/tables/{mt_table}', 'replica1') "
+        f"PARTITION BY year "
+        f"ORDER BY tuple()"
+    )
+    node.query(
+        f"CREATE TABLE {s3_table} (id UInt64, year UInt16) "
+        f"ENGINE = S3(s3_conn, filename='{s3_table}', "
+        f"format=Parquet, partition_strategy='hive') "
+        f"PARTITION BY year"
+    )
+
+    node.query(
+        f"INSERT INTO {mt_table} VALUES (1, '2020'), (2, '2020'), (3, '2020')"
+    )
+
+    # With a String partition column the partition_id is the SipHash of the
+    # value rather than the textual representation — look it up so we can
+    # reference the partition explicitly in EXPORT PARTITION ID and in
+    # subsequent system.replicated_partition_exports queries.
+    partition_id = node.query(
+        f"SELECT partition_id FROM system.parts "
+        f"WHERE database = currentDatabase() AND table = '{mt_table}' "
+        f"  AND active "
+        f"ORDER BY name LIMIT 1"
+    ).strip()
+    assert partition_id, (
+        "Expected one active part on the source table after INSERT; "
+        "system.parts returned nothing."
+    )
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{partition_id}' "
+        f"TO TABLE {s3_table}"
+    )
+    assert "BAD_ARGUMENTS" in error, (
+        f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+        f"got: {error!r}"
+    )
+    assert "partition key column 'year'" in error, (
+        f"Expected the error message to identify the offending partition "
+        f"column 'year', got: {error!r}"
+    )
+
+    # Nothing scheduled: no row in system.replicated_partition_exports.
+    rows_in_system_view = node.query(
+        f"SELECT count() FROM system.replicated_partition_exports "
+        f"WHERE source_table = '{mt_table}' "
+        f"  AND destination_table = '{s3_table}' "
+        f"  AND partition_id = '{partition_id}'"
+    ).strip()
+    assert rows_in_system_view == "0", (
+        f"Expected no row in system.replicated_partition_exports after a "
+        f"synchronously-rejected export, got {rows_in_system_view}."
+    )
+
+    # Nothing written: no parquet file under any year=*/ partition prefix.
+    files_in_s3 = node.query(
+        f"SELECT count() FROM s3(s3_conn, "
+        f"filename='{s3_table}/year=*/*.parquet', format='One')"
+    ).strip()
+    assert files_in_s3 == "0", (
+        f"Expected no Parquet files in S3 after a synchronously-rejected "
+        f"export, found {files_in_s3}."
+    )
+
+
 def test_export_partition_all_failure_modes(cluster):
     """Cover the three values of `export_merge_tree_partition_all_on_error`.
 
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index ceb83191f93a..9357f4e5aa78 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
 -- Tags: no-parallel, no-fasttest
 
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table;
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
 
 SET allow_experimental_export_merge_tree_part=1;
 
@@ -39,4 +39,16 @@ INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice');
 
 ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH}
 
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table;
+-- Castable but non-equal partition-column types must be rejected synchronously.
+-- The partition path is computed from the source part's minmax block (SOURCE types)
+-- BEFORE row data is CAST to destination types — letting String -> UInt16 through
+-- would cause silent divergence between partition path and row values.
+CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+-- Partition-column type mismatch is checked synchronously BEFORE the part lookup,
+-- so we don't need to insert data nor use a real part name to provoke it.
+ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index 0f6647861e43..874b5f288cf7 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
 -- Tags: no-parallel, no-fasttest
 
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table;
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
 
 CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple();
 
@@ -21,4 +21,14 @@ CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_
 
 ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED}
 
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table;
+-- Castable but non-equal partition-column types must be rejected synchronously.
+-- The partition path is computed from the source part's minmax block (SOURCE types)
+-- BEFORE row data is CAST to destination types — letting String -> UInt16 through
+-- would cause silent divergence between partition path and row values.
+CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;

From 67489f2e7f84ca9e2a124937c27859429a0aa2e6 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 8 Jun 2026 10:26:33 -0300
Subject: [PATCH 05/33] fix build and tests

---
 .../MergeTree/ExportPartitionUtils.cpp        |   1 +
 .../test.py                                   | 264 +++++++++++++++++-
 2 files changed, 264 insertions(+), 1 deletion(-)

diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index d5df9b6bea2a..f227d318a1fd 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -10,6 +10,7 @@
 #include <filesystem>
 #include <thread>
 #include <Interpreters/Context.h>
+#include <Interpreters/ExpressionActions.h>
 
 #if USE_AVRO
 #include <Storages/ObjectStorage/DataLakes/Iceberg/Constant.h>
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index e6e5127bdd72..45aa565b3586 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -781,7 +781,8 @@ def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
     )
 
     error = node.query_and_get_error(
-        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}"
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
     )
     assert "BAD_ARGUMENTS" in error, (
         f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
@@ -1050,3 +1051,264 @@ def test_export_partition_writes_column_statistics(cluster):
 
     entries = fetch_manifest_entries(node, query_id)
     assert_exported_stats(entries)
+
+
+# ---------------------------------------------------------------------------
+# Schema compatibility (INSERT SELECT mirror)
+#
+# EXPORT PARTITION builds an ActionsDAG via makeConvertingActions(Position) — the
+# same primitive that powers INSERT INTO dest SELECT * FROM src.  The destination
+# compatibility validation in StorageReplicatedMergeTree::exportPartitionToTable
+# mirrors what the plain-MergeTree EXPORT PART path does in MergeTreeData, so the
+# tests below pin the same contract for the replicated partition export:
+#
+#   * Column counts must match positionally; otherwise the ALTER is rejected
+#     synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled.
+#   * Destination column names need not match source names — positional pairs
+#     are CAST element-wise.
+#   * Castable type differences are accepted at validation time (widening and
+#     even lossy narrowing).  Lossiness only matters at runtime: if the value
+#     does not fit the destination type, the async export worker fails and the
+#     export is marked FAILED in system.replicated_partition_exports; the Iceberg
+#     table is left untouched.
+# ---------------------------------------------------------------------------
+
+
+def test_export_partition_column_count_mismatch_source_more_is_rejected(cluster):
+    """
+    Source has 3 columns (id, year, extra), destination has 2 (id, year).
+    The ALTER must be rejected synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH,
+    nothing must be scheduled in system.replicated_partition_exports, and the
+    Iceberg table must remain empty.
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_count_more_{uid}"
+    iceberg_table = f"iceberg_count_more_{uid}"
+
+    make_rmt(node, mt_table, "id Int64, year Int32, extra String", "year",
+             replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020, 'foo'), (2, 2020, 'bar')")
+
+    make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year")
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+        f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source>dest column count, "
+        f"got: {error!r}"
+    )
+
+    rows_in_system_view = node.query(
+        f"SELECT count() FROM system.replicated_partition_exports "
+        f"WHERE source_table = '{mt_table}' "
+        f"  AND destination_table = '{iceberg_table}' "
+        f"  AND partition_id = '2020'"
+    ).strip()
+    assert rows_in_system_view == "0", (
+        f"Expected no row in system.replicated_partition_exports after a "
+        f"synchronously-rejected export, got {rows_in_system_view}."
+    )
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 0, (
+        f"Expected 0 rows in Iceberg table after rejected export, got {count}"
+    )
+
+
+def test_export_partition_column_count_mismatch_source_fewer_is_rejected(cluster):
+    """
+    Source has 2 columns (id, year), destination has 3 (id, year, extra).
+    Same expected synchronous rejection as the source>dest case.
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_count_fewer_{uid}"
+    iceberg_table = f"iceberg_count_fewer_{uid}"
+
+    make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int64, year Int32, extra String",
+                    partition_by="year")
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    assert "NUMBER_OF_COLUMNS_DOESNT_MATCH" in error, (
+        f"Expected NUMBER_OF_COLUMNS_DOESNT_MATCH for source<dest column count, "
+        f"got: {error!r}"
+    )
+
+    rows_in_system_view = node.query(
+        f"SELECT count() FROM system.replicated_partition_exports "
+        f"WHERE source_table = '{mt_table}' "
+        f"  AND destination_table = '{iceberg_table}' "
+        f"  AND partition_id = '2020'"
+    ).strip()
+    assert rows_in_system_view == "0", (
+        f"Expected no row in system.replicated_partition_exports after a "
+        f"synchronously-rejected export, got {rows_in_system_view}."
+    )
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 0, (
+        f"Expected 0 rows in Iceberg table after rejected export, got {count}"
+    )
+
+
+def test_export_partition_with_renamed_destination_column(cluster):
+    """
+    Source has column `id`, destination has the same shape but the column is
+    named `renamed_id`.  Positional matching must accept the export and the
+    data must land in the destination under the new name.
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_renamed_{uid}"
+    iceberg_table = f"iceberg_renamed_{uid}"
+
+    make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)")
+
+    make_iceberg_s3(node, iceberg_table, "renamed_id Int64, year Int32",
+                    partition_by="year")
+
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT renamed_id, year FROM {iceberg_table} ORDER BY renamed_id"
+    ).strip()
+    assert result == "1\t2020\n2\t2020\n3\t2020", (
+        f"Unexpected data under renamed column:\n{result}"
+    )
+
+
+def test_export_partition_with_castable_widening(cluster):
+    """
+    Source column is Int32, destination expects Int64.  makeConvertingActions
+    inserts a widening CAST; the export must succeed and the values must land
+    as Int64 in Iceberg.  The partition column `year` keeps the same Int32 type
+    on both sides so the exact-type partition check is satisfied.
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_widen_{uid}"
+    iceberg_table = f"iceberg_widen_{uid}"
+
+    make_rmt(node, mt_table, "id Int32, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year")
+
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT id, toTypeName(id), year FROM {iceberg_table} ORDER BY id"
+    ).strip()
+    assert result == "1\tInt64\t2020\n2\tInt64\t2020", (
+        f"Unexpected widened data:\n{result}"
+    )
+
+
+def test_export_partition_with_castable_narrowing_values_fit(cluster):
+    """
+    Source column is Int64, destination expects Int32 — lossy in general but
+    not blocked, mirroring INSERT SELECT.
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_narrow_fit_{uid}"
+    iceberg_table = f"iceberg_narrow_fit_{uid}"
+
+    make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year")
+
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT id, toTypeName(id), year FROM {iceberg_table} ORDER BY id"
+    ).strip()
+    assert result == "1\tInt32\t2020\n2\tInt32\t2020", (
+        f"Unexpected narrowed data:\n{result}"
+    )
+
+
+def test_export_partition_runtime_cast_failure_propagates_async(cluster):
+    """
+    Source has a String column whose value cannot be parsed as the destination
+    Int32 column.  Validation accepts the type pair (String -> Int32 is a
+    defined conversion) so the ALTER returns synchronously; the actual cast
+    runs in the async export worker and must fail there.  The failure exhausts
+    the retry budget and the export is marked FAILED in
+    system.replicated_partition_exports with a non-zero exception_count, while
+    the Iceberg table is left empty.
+
+    (Integer narrowing overflow is not used here because the internal cast
+    `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
+    on overflow rather than throwing — that would still produce a successful
+    export with garbage data, not a propagated failure.)
+    """
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_runtime_cast_fail_{uid}"
+    iceberg_table = f"iceberg_runtime_cast_fail_{uid}"
+
+    make_rmt(node, mt_table, "id String, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES ('not a number', 2020)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year")
+
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
+        f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1"
+    )
+
+    wait_for_export_status(node, mt_table, iceberg_table, "2020", "FAILED", timeout=60)
+
+    exception_count = int(node.query(
+        f"SELECT any(exception_count) FROM system.replicated_partition_exports "
+        f"WHERE source_table = '{mt_table}' "
+        f"  AND destination_table = '{iceberg_table}' "
+        f"  AND partition_id = '2020'"
+    ).strip())
+    assert exception_count > 0, (
+        "Expected non-zero exception_count after a failed runtime cast"
+    )
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 0, (
+        f"Expected 0 rows in Iceberg table after failed export, got {count}"
+    )

From 245c0bb5a3c887b5d642d298435325be16aa8cfa Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 15 Jun 2026 14:20:41 -0300
Subject: [PATCH 06/33] yet another impl

---
 src/Core/Settings.cpp                         |  3 +
 src/Core/SettingsChangesHistory.cpp           |  1 +
 .../MergeTree/ExportPartitionUtils.cpp        | 89 +++++++++----------
 src/Storages/MergeTree/ExportPartitionUtils.h | 40 +++------
 src/Storages/MergeTree/MergeTreeData.cpp      | 41 ++-------
 src/Storages/StorageReplicatedMergeTree.cpp   | 43 ++-------
 .../test.py                                   | 61 +++----------
 .../test.py                                   | 59 +++---------
 .../test.py                                   | 45 ++--------
 ...rge_tree_part_to_object_storage_simple.sql | 35 ++++++--
 ...rge_tree_part_to_object_storage_simple.sql | 33 +++++--
 11 files changed, 157 insertions(+), 293 deletions(-)

diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index ba022db0f83a..6368ec01c1cb 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7605,6 +7605,9 @@ Has no effect on `EXPORT PARTITION <id>` (single-partition export).
 )", 0) \
     DECLARE(String, export_merge_tree_part_filename_pattern, "{part_name}_{checksum}", R"(
 Pattern for the filename of the exported merge tree part. The `part_name` and `checksum` are calculated and replaced on the fly. Additional macros are supported.
+)", 0) \
+    DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"(
+Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
 )", 0) \
     \
     /* ####################################################### */ \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 50cc8c572561..3a72ec667415 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -43,6 +43,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
         {
             {"object_storage_cluster_join_mode", "allow", "allow", "New setting"},
             {"export_merge_tree_partition_task_timeout_seconds", "3600", "86400", "Increase default value to make it more realistic"},
+            {"export_merge_tree_part_allow_lossy_cast", false, false, "New setting to gate lossy casts in EXPORT PART/PARTITION behind explicit acknowledgment"},
         });
         addSettingsChanges(settings_changes_history, "26.3",
         {
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index f227d318a1fd..ef51b10b9c0a 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -9,6 +9,10 @@
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <filesystem>
 #include <thread>
+#include <Core/Block.h>
+#include <Core/Settings.h>
+#include <DataTypes/Utils.h>
+#include <Interpreters/ActionsDAG.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/ExpressionActions.h>
 
@@ -38,6 +42,11 @@ namespace ErrorCodes
     extern const int NETWORK_ERROR;
 }
 
+namespace Setting
+{
+    extern const SettingsBool export_merge_tree_part_allow_lossy_cast;
+}
+
 namespace FailPoints
 {
     extern const char iceberg_export_after_commit_before_zk_completed[];
@@ -404,18 +413,8 @@ namespace ExportPartitionUtils
 #if USE_AVRO
     void verifyIcebergPartitionCompatibility(
         const Poco::JSON::Object::Ptr & metadata_object,
-        const ASTPtr & partition_key_ast,
-        const StorageMetadataPtr & source_metadata,
-        const StorageMetadataPtr & destination_metadata,
-        const StorageID & destination_storage_id)
+        const ASTPtr & partition_key_ast)
     {
-        /// The Iceberg manifest's partition values are derived from the SOURCE
-        /// part's minmax block (see `IcebergMetadata::commitExportPartitionTransaction`)
-        /// while the row data is CAST to the destination Iceberg schema by
-        /// `addExportConvertingActions`. Reject castable type differences on
-        /// partition-key columns up front so the two cannot disagree.
-        verifyPartitionColumnsExactTypeMatch(source_metadata, destination_metadata, destination_storage_id);
-
         const auto original_schema_id = metadata_object->getValue<Int64>(Iceberg::f_current_schema_id);
         const auto partition_spec_id  = metadata_object->getValue<Int64>(Iceberg::f_default_spec_id);
 
@@ -531,51 +530,47 @@ namespace ExportPartitionUtils
     }
 #endif
 
-    void verifyPartitionColumnsExactTypeMatch(
+    void verifyExportSchemaCastable(
         const StorageMetadataPtr & source_metadata,
         const StorageMetadataPtr & destination_metadata,
-        const StorageID & destination_storage_id)
+        const StorageID & destination_storage_id,
+        const ContextPtr & context)
     {
-        if (!source_metadata->hasPartitionKey())
-            return;
+        /// Build (and discard) the same converting DAG the export worker will build
+        /// later, to surface structural mismatches (column count, untyped casts) early.
+        Block source_sample_block;
+        for (const auto & column : source_metadata->getColumns().getReadable())
+            source_sample_block.insert({column.type->createColumn(), column.type, column.name});
+
+        const auto destination_sample_block = destination_metadata->getSampleBlockNonMaterialized();
 
-        /// Use the partition expression's required columns (the ones that flow
-        /// into the destination's partition strategy via
-        /// `minmax_idx->getBlock(storage)`), not the partition expression's
-        /// sample-block output columns.  This catches both identity
-        /// partitioning (`PARTITION BY year`) and computed partitioning
-        /// (`PARTITION BY toYearNumSinceEpoch(event_date)` — required column
-        /// is `event_date`).
-        const auto required_columns
-            = source_metadata->getPartitionKey().expression->getRequiredColumnsWithTypes();
+        const auto source_columns = source_sample_block.getColumnsWithTypeAndName();
+        const auto destination_columns = destination_sample_block.getColumnsWithTypeAndName();
 
-        const auto & destination_columns = destination_metadata->getColumns();
+        (void) ActionsDAG::makeConvertingActions(
+            source_columns,
+            destination_columns,
+            ActionsDAG::MatchColumnsMode::Position,
+            context);
 
-        for (const auto & source_column : required_columns)
+        /// Lossy casts may silently change values, so reject them unless the user opts in.
+        if (context->getSettingsRef()[Setting::export_merge_tree_part_allow_lossy_cast])
+            return;
+
+        const size_t num_columns = std::min(source_columns.size(), destination_columns.size());
+        for (size_t i = 0; i < num_columns; ++i)
         {
-            if (!destination_columns.has(source_column.name))
-                throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                    "Cannot export partition: source partition key references column '{}' "
-                    "which does not exist on the destination table {}. "
-                    "EXPORT does not re-evaluate the partition key against the destination's schema, "
-                    "so every column the source partition expression depends on must be present "
-                    "on the destination with the same type.",
-                    source_column.name, destination_storage_id.getFullTableName());
-
-            const auto destination_type
-                = destination_columns.getPhysical(source_column.name).type;
-            if (!source_column.type->equals(*destination_type))
+            const auto & source_column = source_columns[i];
+            const auto & destination_column = destination_columns[i];
+            if (!canBeSafelyCast(source_column.type, destination_column.type))
                 throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                    "Cannot export partition: partition key column '{}' has type {} on the source "
-                    "but {} on the destination table {}. "
-                    "EXPORT does not re-evaluate the partition key against the destination's schema, "
-                    "so castable type differences are not allowed for partition columns "
-                    "(the partition path and any Iceberg partition values are derived from the "
-                    "source-typed minmax block, while the row data is CAST to the destination type).",
-                    source_column.name,
+                    "Cannot export to {}: column '{}' requires a lossy cast from {} to {}, "
+                    "which may change values. Set `export_merge_tree_part_allow_lossy_cast = 1` "
+                    "to allow lossy casts during export.",
+                    destination_storage_id.getFullTableName(),
+                    destination_column.name,
                     source_column.type->getName(),
-                    destination_type->getName(),
-                    destination_storage_id.getFullTableName());
+                    destination_column.type->getName());
         }
     }
 }
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h
index 958a8a819457..d97bb538a47f 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.h
+++ b/src/Storages/MergeTree/ExportPartitionUtils.h
@@ -89,41 +89,23 @@ namespace ExportPartitionUtils
         const std::string & exception_message,
         const LoggerPtr & log);
 
-    /// EXPORT PART/PARTITION never re-evaluates the partition key against the
-    /// destination's schema: the partition path (object storage) and the
-    /// Iceberg partition values (data lake) are derived from the SOURCE part's
-    /// minmax block in ExportPartTask::executeStep. Therefore every column the
-    /// SOURCE partition expression depends on must have an identically-typed
-    /// counterpart on the destination — castable but non-equal types are not
-    /// safe here even though `makeConvertingActions` would accept them.
-    ///
-    /// Called directly from the non-data-lake branch of
-    /// `MergeTreeData::exportPartToTable` /
-    /// `StorageReplicatedMergeTree::exportPartitionToTable`, and indirectly
-    /// for the Iceberg branch via `verifyIcebergPartitionCompatibility` which
-    /// embeds this check.  Throws BAD_ARGUMENTS on missing column or type
-    /// mismatch.
-    void verifyPartitionColumnsExactTypeMatch(
+    /// Validates that source columns can be exported into the destination with the
+    /// same positional CAST matching as `INSERT INTO dest SELECT * FROM src`. Lossy
+    /// casts are rejected unless `export_merge_tree_part_allow_lossy_cast` is set.
+    /// Throws BAD_ARGUMENTS on any violation.
+    void verifyExportSchemaCastable(
         const StorageMetadataPtr & source_metadata,
         const StorageMetadataPtr & destination_metadata,
-        const StorageID & destination_storage_id);
+        const StorageID & destination_storage_id,
+        const ContextPtr & context);
 
 #if USE_AVRO
-    /// Verifies that the source MergeTree partition key is compatible with the
-    /// destination Iceberg partition spec by:
-    ///   * Comparing partition-field source-ids and transforms in order
-    ///     against the Iceberg spec derived from the MergeTree PARTITION BY AST.
-    ///   * Embedding `verifyPartitionColumnsExactTypeMatch` so that columns
-    ///     flowing through the partition key have identical ClickHouse types on
-    ///     both sides — the Iceberg manifest's partition values come from the
-    ///     SOURCE-typed minmax block and cannot be reconciled later.
-    /// Throws BAD_ARGUMENTS on any mismatch.
+    /// Verifies the source MergeTree partition key matches the destination Iceberg
+    /// partition spec (source-ids and transforms in order). Throws BAD_ARGUMENTS on
+    /// mismatch.
     void verifyIcebergPartitionCompatibility(
         const Poco::JSON::Object::Ptr & metadata_object,
-        const ASTPtr & partition_key_ast,
-        const StorageMetadataPtr & source_metadata,
-        const StorageMetadataPtr & destination_metadata,
-        const StorageID & destination_storage_id);
+        const ASTPtr & partition_key_ast);
 #endif
 }
 
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index ee2b7d2250c8..937c675fab2f 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6745,10 +6745,7 @@ void MergeTreeData::exportPartToTable(
 
             ExportPartitionUtils::verifyIcebergPartitionCompatibility(
                 metadata_object,
-                source_metadata_ptr->getPartitionKeyAST(),
-                source_metadata_ptr,
-                destination_metadata_ptr,
-                dest_storage->getStorageID());
+                source_metadata_ptr->getPartitionKeyAST());
         }
 #else
         (void)iceberg_metadata_json_;
@@ -6756,42 +6753,16 @@ void MergeTreeData::exportPartToTable(
 #endif
     }
 
-    /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
-    /// positional matching with CAST. We surface structural mismatches synchronously at
-    /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
-    /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
-    /// untyped cast) fails here; anything it accepts will be CAST at runtime.
-    {
-        Block source_sample_block;
-        for (const auto & column : source_metadata_ptr->getColumns().getReadable())
-            source_sample_block.insert({column.type->createColumn(), column.type, column.name});
-
-        const auto destination_sample_block = destination_metadata_ptr->getSampleBlockNonMaterialized();
+    /// Positional CAST matching, like `INSERT INTO dest SELECT * FROM src`.
+    ExportPartitionUtils::verifyExportSchemaCastable(
+        source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID(), query_context);
 
-        (void) ActionsDAG::makeConvertingActions(
-            source_sample_block.getColumnsWithTypeAndName(),
-            destination_sample_block.getColumnsWithTypeAndName(),
-            ActionsDAG::MatchColumnsMode::Position,
-            query_context);
-    }
-
-    /// For data lakes (Iceberg) the partition-key checks — both spec/transform
-    /// compatibility and ClickHouse partition-column exact-type equality — are
-    /// performed inside `verifyIcebergPartitionCompatibility` above.
-    /// For non-data-lake destinations we do the AST equivalence check here and
-    /// then run the partition-column exact-type check directly.
+    /// Iceberg partition compatibility is checked above; here we only need the
+    /// partition-key ASTs to match (partition-column types follow the lossy-cast gate).
     if (!dest_storage->isDataLake())
     {
         if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST()))
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
-
-        /// EXPORT does not re-evaluate the partition key against the destination's schema:
-        /// the hive partition path is derived from the SOURCE part's minmax block in
-        /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
-        /// by `addExportConvertingActions`. Reject castable type differences on
-        /// partition-key columns synchronously so the path and the row data cannot disagree.
-        ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
-            source_metadata_ptr, destination_metadata_ptr, dest_storage->getStorageID());
     }
 
     auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated});
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index cb576f137546..2301ec1c1a30 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -8388,42 +8388,16 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
     auto src_snapshot = getInMemoryMetadataPtr();
     auto destination_snapshot = dest_storage->getInMemoryMetadataPtr();
 
-    /// Schema compatibility: use the same rules as `INSERT INTO dest SELECT * FROM src` —
-    /// positional matching with CAST. We surface structural mismatches synchronously at
-    /// ALTER TABLE time by building (and discarding) the same converting DAG the worker
-    /// will build later. Anything `makeConvertingActions` rejects (column-count mismatch,
-    /// untyped cast) fails here; anything it accepts will be CAST at runtime.
-    {
-        Block source_sample_block;
-        for (const auto & column : src_snapshot->getColumns().getReadable())
-            source_sample_block.insert({column.type->createColumn(), column.type, column.name});
-
-        const auto destination_sample_block = destination_snapshot->getSampleBlockNonMaterialized();
-
-        (void) ActionsDAG::makeConvertingActions(
-            source_sample_block.getColumnsWithTypeAndName(),
-            destination_sample_block.getColumnsWithTypeAndName(),
-            ActionsDAG::MatchColumnsMode::Position,
-            query_context);
-    }
-
-    /// For data lakes (Iceberg) the partition-key checks — both spec/transform
-    /// compatibility and ClickHouse partition-column exact-type equality — are
-    /// performed inside `verifyIcebergPartitionCompatibility` below (per partition).
-    /// For non-data-lake destinations we do the AST equivalence check here and
-    /// then run the partition-column exact-type check directly.
+    /// Positional CAST matching, like `INSERT INTO dest SELECT * FROM src`.
+    ExportPartitionUtils::verifyExportSchemaCastable(
+        src_snapshot, destination_snapshot, dest_storage->getStorageID(), query_context);
+
+    /// Iceberg partition compatibility is checked below; here we only need the
+    /// partition-key ASTs to match (partition-column types follow the lossy-cast gate).
     if (!dest_storage->isDataLake())
     {
         if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST()))
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key");
-
-        /// EXPORT does not re-evaluate the partition key against the destination's schema:
-        /// the hive partition path is derived from the SOURCE part's minmax block in
-        /// `ExportPartTask::executeStep`, while the row data is CAST to destination types
-        /// by `addExportConvertingActions`. Reject castable type differences on
-        /// partition-key columns synchronously so the path and the row data cannot disagree.
-        ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch(
-            src_snapshot, destination_snapshot, dest_storage->getStorageID());
     }
 
     zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly();
@@ -8597,10 +8571,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
 
         ExportPartitionUtils::verifyIcebergPartitionCompatibility(
             metadata_object,
-            src_snapshot->getPartitionKeyAST(),
-            src_snapshot,
-            destination_snapshot,
-            dest_storage->getStorageID());
+            src_snapshot->getPartitionKeyAST());
 
         std::ostringstream oss;     // STYLE_CHECK_ALLOW_STD_STRING_STREAM
         oss.exceptions(std::ios::failbit);
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index 43ba24116526..f51dfa3c602e 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -385,28 +385,12 @@ def test_export_part_with_year_transform_partition(cluster):
     node.query(f"DROP TABLE IF EXISTS {iceberg}")
 
 
-def test_export_part_partition_column_type_mismatch_is_rejected(cluster):
-    """
-    EXPORT PART must synchronously reject (BAD_ARGUMENTS) when a partition
-    column has different (even castable) types on source and destination.
-
-    The partition path / Iceberg partition values are computed from the
-    SOURCE part's minmax block in ExportPartTask::executeStep before
-    addExportConvertingActions casts the data, so a castable mismatch on
-    a partition column would let the path/value computation and the row
-    data disagree. The check in
-    ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch rejects this
-    at ALTER time.
-
-    Failing case: source `year Int32` -> destination `year Int64`, both
-    PARTITION BY year (identity transform). Both Int32 and Int64 are valid
-    Iceberg primitives, so verifyIcebergPartitionCompatibility happily
-    accepts the spec — only the exact-type check rejects.
-    """
+def test_export_part_partition_column_lossless_widening(cluster):
+    """A lossless widening of a partition column (year Int32 -> Int64) round-trips."""
     node = cluster.instances["node1"]
     sfx = unique_suffix()
-    mt = f"mt_pcol_type_mismatch_{sfx}"
-    iceberg = f"iceberg_pcol_type_mismatch_{sfx}"
+    mt = f"mt_pcol_widening_{sfx}"
+    iceberg = f"iceberg_pcol_widening_{sfx}"
 
     make_mt(node, mt, "id Int32, year Int32", "year")
     make_iceberg_s3(node, iceberg, "id Int32, year Int64", "year")
@@ -414,39 +398,20 @@ def test_export_part_partition_column_type_mismatch_is_rejected(cluster):
     node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020), (3, 2020)")
 
     part_2020 = get_part(node, mt, "2020")
+    export_part(node, mt, part_2020, iceberg)
+    wait_for_export_part(node, mt, part_2020)
 
-    error = node.query_and_get_error(
-        f"ALTER TABLE {mt} EXPORT PART '{part_2020}' TO TABLE {iceberg} "
-        f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
-        f"allow_experimental_insert_into_iceberg = 1"
-    )
-    assert "BAD_ARGUMENTS" in error, (
-        f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
-        f"got: {error!r}"
-    )
-    assert "partition key column 'year'" in error, (
-        f"Expected the error message to identify the offending partition "
-        f"column 'year', got: {error!r}"
-    )
+    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
+    assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
 
-    # No async work scheduled: no ExportPart row in system.part_log.
-    node.query("SYSTEM FLUSH LOGS")
-    part_log_count = node.query(
-        f"SELECT count() FROM system.part_log "
-        f"WHERE event_type = 'ExportPart' "
-        f"  AND database = currentDatabase() "
-        f"  AND table = '{mt}' "
-        f"  AND part_name = '{part_2020}'"
+    result = node.query(
+        f"SELECT id, toTypeName(year), year FROM {iceberg} ORDER BY id"
     ).strip()
-    assert part_log_count == "0", (
-        f"Expected no ExportPart entries in system.part_log after a "
-        f"synchronously-rejected export, found {part_log_count}."
+    assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", (
+        f"Unexpected widened partition-column data:\n{result}"
     )
 
-    count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
-    assert count == 0, (
-        f"Expected 0 rows in Iceberg table after rejected export, got {count}"
-    )
+    assert_part_log(node, mt, part_2020)
 
     node.query(f"DROP TABLE IF EXISTS {mt} SYNC")
     node.query(f"DROP TABLE IF EXISTS {iceberg}")
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 45aa565b3586..6bd080bc72f0 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,30 +746,13 @@ def test_partition_key_compatibility_check(cluster):
     )
 
 
-def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
-    """
-    EXPORT PARTITION must synchronously reject (BAD_ARGUMENTS) when a
-    partition column has a different (even castable) type on the source
-    MergeTree and the destination Iceberg table.
-
-    The Iceberg destination's partition path is derived from the SOURCE
-    part's minmax block in ExportPartTask::executeStep, and the Iceberg
-    manifest's partition values come from the SOURCE-typed
-    `manifest.data_part->partition.value` Fields. Letting a castable type
-    mismatch through would mean the manifest/path computation and the row
-    data (which IS cast by addExportConvertingActions) disagree — not
-    equivalent to INSERT SELECT.
-
-    Failing case: source `year Int32` -> destination `year Int64`, both
-    PARTITION BY year (identity transform). Both are valid Iceberg primitives
-    so verifyIcebergPartitionCompatibility accepts the spec; only the new
-    exact-type check rejects.
-    """
+def test_export_partition_partition_column_lossless_widening(cluster):
+    """A lossless widening of a partition column (year Int32 -> Int64) round-trips."""
     node = cluster.instances["replica1"]
 
     uid = unique_suffix()
-    mt_table = f"mt_pcol_type_mismatch_{uid}"
-    iceberg_table = f"iceberg_pcol_type_mismatch_{uid}"
+    mt_table = f"mt_pcol_widening_{uid}"
+    iceberg_table = f"iceberg_pcol_widening_{uid}"
 
     # Source uses year Int32; destination Iceberg uses year Int64.
     make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
@@ -780,36 +763,20 @@ def test_export_partition_partition_column_type_mismatch_is_rejected(cluster):
         partition_by="year", s3_retry_attempts=3,
     )
 
-    error = node.query_and_get_error(
+    node.query(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
         settings={"allow_insert_into_iceberg": 1},
     )
-    assert "BAD_ARGUMENTS" in error, (
-        f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
-        f"got: {error!r}"
-    )
-    assert "partition key column 'year'" in error, (
-        f"Expected the error message to identify the offending partition "
-        f"column 'year', got: {error!r}"
-    )
-
-    # No async work scheduled.
-    rows_in_system_view = node.query(
-        f"SELECT count() FROM system.replicated_partition_exports "
-        f"WHERE source_table = '{mt_table}' "
-        f"  AND destination_table = '{iceberg_table}' "
-        f"  AND partition_id = '2020'"
-    ).strip()
-    assert rows_in_system_view == "0", (
-        f"Expected no row in system.replicated_partition_exports after a "
-        f"synchronously-rejected export, got {rows_in_system_view}."
-    )
+    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
 
-    # Destination must remain empty.
     count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
-    assert count == 0, (
-        f"Expected 0 rows in Iceberg destination after rejected export, "
-        f"got {count}"
+    assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
+
+    result = node.query(
+        f"SELECT id, toTypeName(year), year FROM {iceberg_table} ORDER BY id"
+    ).strip()
+    assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", (
+        f"Unexpected widened partition-column data:\n{result}"
     )
 
 
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index 5c3e656107bc..d403538999df 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -1533,41 +1533,8 @@ def test_export_partition_all(cluster):
 
 
 def test_export_partition_partition_column_castable_type_mismatch(cluster):
-    """
-    Post-fix contract: EXPORT PARTITION must reject castable but non-equal
-    partition-column types synchronously.
-
-    Setup
-    -----
-        * Source (ReplicatedMergeTree): ``id UInt64, year String``,
-          ``PARTITION BY year``
-        * Destination (S3, partition_strategy='hive'): ``id UInt64,
-          year UInt16``, ``PARTITION BY year``
-
-    Why the synchronous rejection is required
-    -----------------------------------------
-    ``makeConvertingActions(Position)`` accepts ``String -> UInt16`` and the
-    PARTITION BY ASTs are textually identical, so the schema/AST validators
-    would let this through.  But EXPORT does not re-evaluate the partition
-    key against the destination's schema: in ``ExportPartTask::executeStep``
-    the block fed to ``destination_storage->import`` is built from the
-    source part's minmax index (SOURCE types).  If we let the export proceed
-    the hive partition path is computed from the un-cast SOURCE value while
-    the row data is the CAST value — silent divergence from INSERT SELECT.
-    The exact-type partition-column check in
-    ``ExportPartitionUtils::verifyPartitionColumnsExactTypeMatch`` rejects
-    this pairing at ALTER time.
-
-    Assertions
-    ----------
-        * ``ALTER ... EXPORT PARTITION ID '<id>' TO TABLE ...`` raises a
-          ``BAD_ARGUMENTS`` exception whose text references the partition
-          column type mismatch.
-        * No row is created in ``system.replicated_partition_exports`` for
-          this (source_table, destination_table, partition_id) — nothing
-          was ever scheduled.
-        * No parquet file is written under any ``year=*/`` prefix in S3.
-    """
+    """A lossy partition-column cast (year String -> UInt16) is rejected synchronously
+    when export_merge_tree_part_allow_lossy_cast is off, scheduling nothing."""
     skip_if_remote_database_disk_enabled(cluster)
     node = cluster.instances["replica1"]
 
@@ -1614,12 +1581,12 @@ def test_export_partition_partition_column_castable_type_mismatch(cluster):
         f"TO TABLE {s3_table}"
     )
     assert "BAD_ARGUMENTS" in error, (
-        f"Expected BAD_ARGUMENTS for partition-column type mismatch, "
+        f"Expected BAD_ARGUMENTS for a lossy partition-column cast, "
         f"got: {error!r}"
     )
-    assert "partition key column 'year'" in error, (
-        f"Expected the error message to identify the offending partition "
-        f"column 'year', got: {error!r}"
+    assert "requires a lossy cast" in error and "'year'" in error, (
+        f"Expected the error message to report the lossy cast on column "
+        f"'year', got: {error!r}"
     )
 
     # Nothing scheduled: no row in system.replicated_partition_exports.
diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index 9357f4e5aa78..e40c5ebde461 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
 -- Tags: no-parallel, no-fasttest
 
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3;
 
 SET allow_experimental_export_merge_tree_part=1;
 
@@ -39,16 +39,37 @@ INSERT INTO 03572_ephemeral_mt_table (id, year, name) VALUES (1, 2020, 'alice');
 
 ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_matching_ephemeral_s3_table; -- {serverError NUMBER_OF_COLUMNS_DOESNT_MATCH}
 
--- Castable but non-equal partition-column types must be rejected synchronously.
--- The partition path is computed from the source part's minmax block (SOURCE types)
--- BEFORE row data is CAST to destination types — letting String -> UInt16 through
--- would cause silent divergence between partition path and row values.
+-- Partition columns follow the same lossy-cast gate as any other column (no special
+-- exact-type guard). String -> UInt16 is a lossy cast, so with the default
+-- export_merge_tree_part_allow_lossy_cast = 0 it is rejected synchronously.
 CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
--- Partition-column type mismatch is checked synchronously BEFORE the part lookup,
+-- The lossy-cast check fires synchronously BEFORE the part lookup,
 -- so we don't need to insert data nor use a real part name to provoke it.
 ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
-DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3;
+-- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
+-- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
+-- fake part name is enough to provoke it.
+CREATE TABLE 03572_lossy_mt (id Int64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+-- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
+-- passes and execution proceeds to the part lookup, which fails on the fake part name.
+ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
+
+-- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
+-- validation passes and we reach the part lookup, which fails on the fake part name.
+CREATE TABLE 03572_lossless_mt (id Int32, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossless_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
+
+DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index 874b5f288cf7..e20fc14a26a5 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -1,6 +1,6 @@
 -- Tags: no-parallel, no-fasttest
 
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3;
 
 CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple();
 
@@ -21,14 +21,35 @@ CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_
 
 ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED}
 
--- Castable but non-equal partition-column types must be rejected synchronously.
--- The partition path is computed from the source part's minmax block (SOURCE types)
--- BEFORE row data is CAST to destination types — letting String -> UInt16 through
--- would cause silent divergence between partition path and row values.
+-- Partition columns follow the same lossy-cast gate as any other column (no special
+-- exact-type guard). String -> UInt16 is a lossy cast, so with the default
+-- export_merge_tree_part_allow_lossy_cast = 0 it is rejected synchronously.
 CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
 ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
-DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3;
+-- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
+-- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
+-- fake part name is enough to provoke it.
+CREATE TABLE 03572_rmt_lossy_mt (id Int64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossy/03572_rmt_lossy_mt', 'replica1') PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_rmt_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
+
+-- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
+-- passes and execution proceeds to the part lookup, which fails on the fake part name.
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
+
+-- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
+-- validation passes and we reach the part lookup, which fails on the fake part name.
+CREATE TABLE 03572_rmt_lossless_mt (id Int32, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossless/03572_rmt_lossless_mt', 'replica1') PARTITION BY year ORDER BY tuple();
+CREATE TABLE 03572_rmt_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
+
+ALTER TABLE 03572_rmt_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossless_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
+
+DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3;

From 8d2b0862604dd7fca778d8de5a598fbe92d9ce5f Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 15 Jun 2026 14:42:27 -0300
Subject: [PATCH 07/33] fix tests and background setting

---
 .../MergeTree/ExportPartitionUtils.cpp        |  3 ++
 .../test.py                                   | 47 +++++++++----------
 .../test.py                                   | 41 +++++++---------
 3 files changed, 41 insertions(+), 50 deletions(-)

diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index ef51b10b9c0a..c902a4042218 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -102,6 +102,9 @@ namespace ExportPartitionUtils
         /// stalls when the setting is only set at the query level.
         context_copy->setSetting("allow_insert_into_iceberg", true);
 
+        /// This setting, just like allow_insert_into_iceberg, has been verified at request time. At this point, we allow everything.
+        context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", true);
+
 	    return context_copy;
     }
 
diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index f51dfa3c602e..4a8514ef3ae1 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -69,11 +69,15 @@ def get_part(node, table: str, partition_id: str) -> str:
     ).strip()
 
 
-def export_part(node, table: str, part: str, dest: str) -> None:
+def export_part(node, table: str, part: str, dest: str, extra_settings: str = "") -> None:
+    settings = (
+        "allow_experimental_export_merge_tree_part = 1, "
+        "allow_experimental_insert_into_iceberg = 1"
+    )
+    if extra_settings:
+        settings += ", " + extra_settings
     node.query(
-        f"ALTER TABLE {table} EXPORT PART '{part}' TO TABLE {dest} "
-        f"SETTINGS allow_experimental_export_merge_tree_part = 1, "
-        f"allow_experimental_insert_into_iceberg = 1"
+        f"ALTER TABLE {table} EXPORT PART '{part}' TO TABLE {dest} SETTINGS {settings}"
     )
 
 
@@ -560,10 +564,8 @@ def test_export_part_writes_column_statistics(cluster):
 #     synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands.
 #   * Destination column names need not match source names — positional pairs
 #     are CAST element-wise.
-#   * Castable type differences are accepted at validation time (widening and
-#     even lossy narrowing).  Lossiness only matters at runtime: if the value
-#     does not fit the destination type, the async export worker fails and the
-#     failure surfaces via system.part_log; the Iceberg table is untouched.
+#   * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
+#     validation time unless export_merge_tree_part_allow_lossy_cast = 1.
 # ---------------------------------------------------------------------------
 
 
@@ -701,10 +703,8 @@ def test_export_part_with_castable_widening(cluster):
 
 
 def test_export_part_with_castable_narrowing_values_fit(cluster):
-    """
-    Source column is Int64, destination expects Int32 — lossy in general but
-    not blocked, mirroring INSERT SELECT.
-    """
+    """A lossy narrowing (id Int64 -> Int32) succeeds once the user opts in via
+    export_merge_tree_part_allow_lossy_cast."""
     node = cluster.instances["node1"]
     sfx = unique_suffix()
     mt = f"mt_narrow_fit_{sfx}"
@@ -716,7 +716,7 @@ def test_export_part_with_castable_narrowing_values_fit(cluster):
     node.query(f"INSERT INTO {mt} VALUES (1, 2020), (2, 2020)")
     part_2020 = get_part(node, mt, "2020")
 
-    export_part(node, mt, part_2020, iceberg)
+    export_part(node, mt, part_2020, iceberg, "export_merge_tree_part_allow_lossy_cast = 1")
     wait_for_export_part(node, mt, part_2020)
 
     count = int(node.query(f"SELECT count() FROM {iceberg}").strip())
@@ -736,18 +736,13 @@ def test_export_part_with_castable_narrowing_values_fit(cluster):
 
 
 def test_export_part_runtime_cast_failure_propagates_async(cluster):
-    """
-    Source has a String column whose value cannot be parsed as the destination
-    Int32 column.  Validation accepts the type pair (String -> Int32 is a
-    defined conversion) so the ALTER returns synchronously; the actual cast
-    runs in the async export worker and must fail there.  The failure is
-    propagated to system.part_log (non-zero `error` column with a non-empty
-    `exception` text) and the Iceberg table is left empty.
-
-    (Integer narrowing overflow is not used here because the internal cast
-    `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
-    on overflow rather than throwing — that would still produce a successful
-    export with garbage data, not a propagated failure.)
+    """A String value that cannot be parsed as the destination Int32 passes the
+    synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but
+    fails at runtime in the async worker; the failure surfaces in system.part_log and
+    Iceberg is left empty.
+
+    (Integer overflow is not used because the internal cast uses CastType::nonAccurate,
+    which wraps rather than throwing.)
     """
     node = cluster.instances["node1"]
     sfx = unique_suffix()
@@ -760,7 +755,7 @@ def test_export_part_runtime_cast_failure_propagates_async(cluster):
     node.query(f"INSERT INTO {mt} VALUES ('not a number', 2020)")
     part_2020 = get_part(node, mt, "2020")
 
-    export_part(node, mt, part_2020, iceberg)
+    export_part(node, mt, part_2020, iceberg, "export_merge_tree_part_allow_lossy_cast = 1")
 
     exception = wait_for_failed_export_part(node, mt, part_2020)
     assert exception, (
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 6bd080bc72f0..8e3f055c4317 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -1033,11 +1033,8 @@ def test_export_partition_writes_column_statistics(cluster):
 #     synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled.
 #   * Destination column names need not match source names — positional pairs
 #     are CAST element-wise.
-#   * Castable type differences are accepted at validation time (widening and
-#     even lossy narrowing).  Lossiness only matters at runtime: if the value
-#     does not fit the destination type, the async export worker fails and the
-#     export is marked FAILED in system.replicated_partition_exports; the Iceberg
-#     table is left untouched.
+#   * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
+#     validation time unless export_merge_tree_part_allow_lossy_cast = 1.
 # ---------------------------------------------------------------------------
 
 
@@ -1200,10 +1197,8 @@ def test_export_partition_with_castable_widening(cluster):
 
 
 def test_export_partition_with_castable_narrowing_values_fit(cluster):
-    """
-    Source column is Int64, destination expects Int32 — lossy in general but
-    not blocked, mirroring INSERT SELECT.
-    """
+    """A lossy narrowing (id Int64 -> Int32) succeeds once the user opts in via
+    export_merge_tree_part_allow_lossy_cast."""
     node = cluster.instances["replica1"]
 
     uid = unique_suffix()
@@ -1217,7 +1212,10 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster):
 
     node.query(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
-        settings={"allow_insert_into_iceberg": 1},
+        settings={
+            "allow_insert_into_iceberg": 1,
+            "export_merge_tree_part_allow_lossy_cast": 1,
+        },
     )
     wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
 
@@ -1233,19 +1231,13 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster):
 
 
 def test_export_partition_runtime_cast_failure_propagates_async(cluster):
-    """
-    Source has a String column whose value cannot be parsed as the destination
-    Int32 column.  Validation accepts the type pair (String -> Int32 is a
-    defined conversion) so the ALTER returns synchronously; the actual cast
-    runs in the async export worker and must fail there.  The failure exhausts
-    the retry budget and the export is marked FAILED in
-    system.replicated_partition_exports with a non-zero exception_count, while
-    the Iceberg table is left empty.
-
-    (Integer narrowing overflow is not used here because the internal cast
-    `makeConvertingActions` builds uses `CastType::nonAccurate`, which wraps
-    on overflow rather than throwing — that would still produce a successful
-    export with garbage data, not a propagated failure.)
+    """A String value that cannot be parsed as the destination Int32 passes the
+    synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but
+    fails at runtime in the async worker, marking the export FAILED and leaving Iceberg
+    empty.
+
+    (Integer overflow is not used because the internal cast uses CastType::nonAccurate,
+    which wraps rather than throwing.)
     """
     node = cluster.instances["replica1"]
 
@@ -1260,7 +1252,8 @@ def test_export_partition_runtime_cast_failure_propagates_async(cluster):
 
     node.query(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
-        f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1"
+        f"SETTINGS export_merge_tree_partition_max_retries = 1, allow_insert_into_iceberg = 1, "
+        f"export_merge_tree_part_allow_lossy_cast = 1"
     )
 
     wait_for_export_status(node, mt_table, iceberg_table, "2020", "FAILED", timeout=60)

From 4100ecd322666697c5c6685ded4e53f1588091f9 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 15 Jun 2026 15:45:22 -0300
Subject: [PATCH 08/33] fix tests

---
 ...rge_tree_part_to_object_storage_simple.sql | 21 +++++++------------
 ...rge_tree_part_to_object_storage_simple.sql | 20 ++++++++----------
 2 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
index e40c5ebde461..8200233a7322 100644
--- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql
@@ -45,31 +45,26 @@ ALTER TABLE 03572_ephemeral_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_mat
 CREATE TABLE 03572_partition_type_mismatch_mt (id UInt64, year String) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
--- The lossy-cast check fires synchronously BEFORE the part lookup,
--- so we don't need to insert data nor use a real part name to provoke it.
-ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_partition_type_mismatch_s3
+ALTER TABLE 03572_partition_type_mismatch_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_partition_type_mismatch_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
--- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
--- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
--- fake part name is enough to provoke it.
 CREATE TABLE 03572_lossy_mt (id Int64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
-ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+ALTER TABLE 03572_lossy_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossy_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
--- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
--- passes and execution proceeds to the part lookup, which fails on the fake part name.
-ALTER TABLE 03572_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossy_s3
+-- With the acknowledgment setting enabled, the lossy cast passes validation and reaches the
+-- part lookup, which fails because the part does not exist.
+ALTER TABLE 03572_lossy_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossy_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
 
--- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
--- validation passes and we reach the part lookup, which fails on the fake part name.
+-- A lossless widening cast (Int32 -> Int64) passes validation without the setting and reaches
+-- the part lookup, which fails because the part does not exist.
 CREATE TABLE 03572_lossless_mt (id Int32, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
-ALTER TABLE 03572_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_lossless_s3
+ALTER TABLE 03572_lossless_mt EXPORT PART '2020_1_1_0' TO TABLE 03572_lossless_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
 
 DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table, 03572_ephemeral_mt_table, 03572_matching_ephemeral_s3_table, 03572_partition_type_mismatch_mt, 03572_partition_type_mismatch_s3, 03572_lossy_mt, 03572_lossy_s3, 03572_lossless_mt, 03572_lossless_s3;
diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
index e20fc14a26a5..0a199755c40a 100644
--- a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
+++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql
@@ -27,29 +27,27 @@ ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_sche
 CREATE TABLE 03572_rmt_partition_type_mismatch_mt (id UInt64, year String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_pcol_type/03572_rmt_partition_type_mismatch_mt', 'replica1') PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_rmt_partition_type_mismatch_s3 (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_partition_type_mismatch_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
-ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_partition_type_mismatch_s3
+ALTER TABLE 03572_rmt_partition_type_mismatch_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_partition_type_mismatch_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
--- A lossy cast on a non-partition column (Int64 -> Int32) may silently change values.
--- It is rejected synchronously by default. The check fires BEFORE the part lookup, so a
--- fake part name is enough to provoke it.
+-- A lossy cast on a non-partition column (Int64 -> Int32) is rejected synchronously by default.
 CREATE TABLE 03572_rmt_lossy_mt (id Int64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossy/03572_rmt_lossy_mt', 'replica1') PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_rmt_lossy_s3 (id Int32, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossy_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
-ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossy_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError BAD_ARGUMENTS}
 
--- With the acknowledgment setting enabled, the same lossy cast is allowed: schema validation
--- passes and execution proceeds to the part lookup, which fails on the fake part name.
-ALTER TABLE 03572_rmt_lossy_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossy_s3
+-- With the acknowledgment setting enabled, the lossy cast passes validation and reaches the
+-- part lookup, which fails because the part does not exist.
+ALTER TABLE 03572_rmt_lossy_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossy_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1, export_merge_tree_part_allow_lossy_cast = 1; -- {serverError NO_SUCH_DATA_PART}
 
--- A lossless widening cast (Int32 -> Int64) is always allowed without the setting: schema
--- validation passes and we reach the part lookup, which fails on the fake part name.
+-- A lossless widening cast (Int32 -> Int64) passes validation without the setting and reaches
+-- the part lookup, which fails because the part does not exist.
 CREATE TABLE 03572_rmt_lossless_mt (id Int32, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt_lossless/03572_rmt_lossless_mt', 'replica1') PARTITION BY year ORDER BY tuple();
 CREATE TABLE 03572_rmt_lossless_s3 (id Int64, year UInt16) ENGINE = S3(s3_conn, filename='03572_rmt_lossless_s3', format='Parquet', partition_strategy='hive') PARTITION BY year;
 
-ALTER TABLE 03572_rmt_lossless_mt EXPORT PART 'any_part_name' TO TABLE 03572_rmt_lossless_s3
+ALTER TABLE 03572_rmt_lossless_mt EXPORT PART '2020_0_0_0' TO TABLE 03572_rmt_lossless_s3
 SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NO_SUCH_DATA_PART}
 
 DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table, 03572_rmt_partition_type_mismatch_mt, 03572_rmt_partition_type_mismatch_s3, 03572_rmt_lossy_mt, 03572_rmt_lossy_s3, 03572_rmt_lossless_mt, 03572_rmt_lossless_s3;

From 914de7283facdfb4c456807b26403ccd76c1a578 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 15 Jun 2026 17:51:54 -0300
Subject: [PATCH 09/33] some more test fixes

---
 .../test.py                                   |  16 --
 .../test.py                                   | 246 +++++++++++++-----
 2 files changed, 185 insertions(+), 77 deletions(-)

diff --git a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
index 4a8514ef3ae1..486adf1f2b17 100644
--- a/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
+++ b/tests/integration/test_export_merge_tree_part_to_iceberg/test.py
@@ -553,22 +553,6 @@ def test_export_part_writes_column_statistics(cluster):
     node.query(f"DROP TABLE IF EXISTS {iceberg}")
 
 
-# ---------------------------------------------------------------------------
-# Schema compatibility (INSERT SELECT mirror)
-#
-# EXPORT PART builds an ActionsDAG via makeConvertingActions(Position) — the
-# same primitive that powers INSERT INTO dest SELECT * FROM src.  The tests
-# below pin that contract:
-#
-#   * Column counts must match positionally; otherwise the ALTER is rejected
-#     synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing lands.
-#   * Destination column names need not match source names — positional pairs
-#     are CAST element-wise.
-#   * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
-#     validation time unless export_merge_tree_part_allow_lossy_cast = 1.
-# ---------------------------------------------------------------------------
-
-
 def test_export_part_column_count_mismatch_source_more_is_rejected(cluster):
     """
     Source has 3 columns (id, year, extra), destination has 2 (id, year).
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 8e3f055c4317..09cf1c2ff467 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,40 +746,6 @@ def test_partition_key_compatibility_check(cluster):
     )
 
 
-def test_export_partition_partition_column_lossless_widening(cluster):
-    """A lossless widening of a partition column (year Int32 -> Int64) round-trips."""
-    node = cluster.instances["replica1"]
-
-    uid = unique_suffix()
-    mt_table = f"mt_pcol_widening_{uid}"
-    iceberg_table = f"iceberg_pcol_widening_{uid}"
-
-    # Source uses year Int32; destination Iceberg uses year Int64.
-    make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
-    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020)")
-
-    make_iceberg_s3(
-        node, iceberg_table, "id Int64, year Int64",
-        partition_by="year", s3_retry_attempts=3,
-    )
-
-    node.query(
-        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
-        settings={"allow_insert_into_iceberg": 1},
-    )
-    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
-    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
-    assert count == 3, f"Expected 3 rows in Iceberg table after export, got {count}"
-
-    result = node.query(
-        f"SELECT id, toTypeName(year), year FROM {iceberg_table} ORDER BY id"
-    ).strip()
-    assert result == "1\tInt64\t2020\n2\tInt64\t2020\n3\tInt64\t2020", (
-        f"Unexpected widened partition-column data:\n{result}"
-    )
-
-
 def test_export_ttl(cluster):
     """
     After a manifest TTL expires the same partition can be re-exported, and the
@@ -1020,24 +986,6 @@ def test_export_partition_writes_column_statistics(cluster):
     assert_exported_stats(entries)
 
 
-# ---------------------------------------------------------------------------
-# Schema compatibility (INSERT SELECT mirror)
-#
-# EXPORT PARTITION builds an ActionsDAG via makeConvertingActions(Position) — the
-# same primitive that powers INSERT INTO dest SELECT * FROM src.  The destination
-# compatibility validation in StorageReplicatedMergeTree::exportPartitionToTable
-# mirrors what the plain-MergeTree EXPORT PART path does in MergeTreeData, so the
-# tests below pin the same contract for the replicated partition export:
-#
-#   * Column counts must match positionally; otherwise the ALTER is rejected
-#     synchronously with NUMBER_OF_COLUMNS_DOESNT_MATCH and nothing is scheduled.
-#   * Destination column names need not match source names — positional pairs
-#     are CAST element-wise.
-#   * Lossless casts (e.g. widening) are accepted; lossy casts are rejected at
-#     validation time unless export_merge_tree_part_allow_lossy_cast = 1.
-# ---------------------------------------------------------------------------
-
-
 def test_export_partition_column_count_mismatch_source_more_is_rejected(cluster):
     """
     Source has 3 columns (id, year, extra), destination has 2 (id, year).
@@ -1162,12 +1110,8 @@ def test_export_partition_with_renamed_destination_column(cluster):
 
 
 def test_export_partition_with_castable_widening(cluster):
-    """
-    Source column is Int32, destination expects Int64.  makeConvertingActions
-    inserts a widening CAST; the export must succeed and the values must land
-    as Int64 in Iceberg.  The partition column `year` keeps the same Int32 type
-    on both sides so the exact-type partition check is satisfied.
-    """
+    """A lossless widening of both a data column (id Int32 -> Int64) and the
+    partition column (year Int32 -> Int64) round-trips."""
     node = cluster.instances["replica1"]
 
     uid = unique_suffix()
@@ -1177,7 +1121,7 @@ def test_export_partition_with_castable_widening(cluster):
     make_rmt(node, mt_table, "id Int32, year Int32", "year", replica_name="replica1")
     node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020)")
 
-    make_iceberg_s3(node, iceberg_table, "id Int64, year Int32", partition_by="year")
+    make_iceberg_s3(node, iceberg_table, "id Int64, year Int64", partition_by="year")
 
     node.query(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
@@ -1189,9 +1133,9 @@ def test_export_partition_with_castable_widening(cluster):
     assert count == 2, f"Expected 2 rows in Iceberg table after export, got {count}"
 
     result = node.query(
-        f"SELECT id, toTypeName(id), year FROM {iceberg_table} ORDER BY id"
+        f"SELECT id, toTypeName(id), year, toTypeName(year) FROM {iceberg_table} ORDER BY id"
     ).strip()
-    assert result == "1\tInt64\t2020\n2\tInt64\t2020", (
+    assert result == "1\tInt64\t2020\tInt64\n2\tInt64\t2020\tInt64", (
         f"Unexpected widened data:\n{result}"
     )
 
@@ -1230,6 +1174,31 @@ def test_export_partition_with_castable_narrowing_values_fit(cluster):
     )
 
 
+def test_export_partition_lossy_cast_rejected_without_optin(cluster):
+    """A lossy narrowing (id Int64 -> Int32) is rejected synchronously with
+    BAD_ARGUMENTS unless export_merge_tree_part_allow_lossy_cast is set."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_lossy_reject_{uid}"
+    iceberg_table = f"iceberg_lossy_reject_{uid}"
+
+    make_rmt(node, mt_table, "id Int64, year Int32", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2020)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int32, year Int32", partition_by="year")
+
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
+        f"SETTINGS allow_insert_into_iceberg = 1"
+    )
+    assert "BAD_ARGUMENTS" in error, f"Expected BAD_ARGUMENTS, got: {error!r}"
+    assert "lossy cast" in error, f"Expected 'lossy cast' in error, got: {error!r}"
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 0, f"Expected no rows after a rejected export, got {count}"
+
+
 def test_export_partition_runtime_cast_failure_propagates_async(cluster):
     """A String value that cannot be parsed as the destination Int32 passes the
     synchronous lossy-cast gate (with export_merge_tree_part_allow_lossy_cast = 1) but
@@ -1272,3 +1241,158 @@ def test_export_partition_runtime_cast_failure_propagates_async(cluster):
     assert count == 0, (
         f"Expected 0 rows in Iceberg table after failed export, got {count}"
     )
+
+
+def test_export_partition_all_iceberg_types(cluster):
+    """Every getIcebergType-supported type round-trips through an EXPORT PARTITION:
+    scalars use narrower source types (explicit lossless widening CASTs), plus
+    Array/Map/Tuple nested columns."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_all_types_{uid}"
+    iceberg_table = f"iceberg_all_types_{uid}"
+
+    # Scalar source types are strictly narrower than the destination; the export inserts
+    # a positional widening CAST per column (Int8->Int16, UInt32->UInt64, ...). Nested
+    # columns keep the same type on both sides.
+    source_columns = (
+        "i16 Int8, u16 UInt8, u32 UInt16, u64 UInt32, "
+        "id Int16, big Int32, f32 Float32, f64 Float64, "
+        "d Date, d32 Date32, dt DateTime, dt64 DateTime64(6), "
+        "s String, uid UUID, "
+        "arr Array(Int32), m Map(String, Int64), tup Tuple(a Int32, b String), "
+        "year Int32"
+    )
+    dest_columns = (
+        "i16 Int16, u16 UInt16, u32 UInt32, u64 UInt64, "
+        "id Int32, big Int64, f32 Float32, f64 Float64, "
+        "d Date, d32 Date32, dt DateTime, dt64 DateTime64(6), "
+        "s String, uid UUID, "
+        "arr Array(Int32), m Map(String, Int64), tup Tuple(a Int32, b String), "
+        "year Int32"
+    )
+
+    make_rmt(node, mt_table, source_columns, "year", replica_name="replica1")
+    make_iceberg_s3(node, iceberg_table, dest_columns, partition_by="year")
+
+    node.query(
+        f"""
+        INSERT INTO {mt_table}
+            (i16, u16, u32, u64, id, big, f32, f64, d, d32, dt, dt64, s, uid, arr, m, tup, year)
+        VALUES (
+            -100, 200, 50000, 4000000000,
+            12345, 1000000000, 3.14, 2.718281828459045,
+            '2024-01-15', '2024-01-15', '2024-01-15 12:30:45', '2024-01-15 12:30:45.123456',
+            'hello iceberg', '550e8400-e29b-41d4-a716-446655440000',
+            [1, 2, 3], {{'a': 10, 'b': 20}}, (7, 'seven'), 2024
+        )
+        """
+    )
+
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2024' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, "2024", "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 1, f"Expected 1 row in Iceberg table, got {count}"
+
+    result = node.query(
+        f"""
+        SELECT
+            i16, u16, u32, u64, id, big,
+            toString(d), toString(d32), toString(dt),
+            s, toString(uid),
+            arr, m['a'], m['b'], tup.a, tup.b, year
+        FROM {iceberg_table}
+        """
+    ).strip()
+    expected = "\t".join([
+        "-100", "200", "50000", "4000000000",
+        "12345", "1000000000",
+        "2024-01-15", "2024-01-15", "2024-01-15 12:30:45.000000",
+        "hello iceberg", "550e8400-e29b-41d4-a716-446655440000",
+        "[1,2,3]", "10", "20", "7", "seven", "2024",
+    ])
+    assert result == expected, f"Unexpected round-trip data:\n{result!r}\nexpected:\n{expected!r}"
+
+    # Floats compared with a tolerance to avoid formatting flakiness.
+    floats_ok = node.query(
+        f"SELECT abs(f32 - 3.14) < 1e-4 AND abs(f64 - 2.718281828459045) < 1e-12 FROM {iceberg_table}"
+    ).strip()
+    assert floats_ok == "1", f"Float round-trip outside tolerance: {floats_ok!r}"
+
+    # DateTime64 sub-second component: assert the date part is preserved (exact format varies).
+    ts_result = node.query(f"SELECT dt64 FROM {iceberg_table}").strip()
+    assert "2024-01-15" in ts_result, f"DateTime64 date component missing: {ts_result!r}"
+
+
+def test_export_partition_all_iceberg_types_lossy(cluster):
+    """Lossy narrowing casts across types succeed with the opt-in flag: values that
+    fit round-trip, Float64 -> Float32 loses precision, and Nullable columns carry
+    both NULL and non-NULL (the latter via a lossy Nullable(Int64) -> Nullable(Int32))."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_lossy_types_{uid}"
+    iceberg_table = f"iceberg_lossy_types_{uid}"
+
+    # Each source column is wider than the destination, so the export inserts a lossy
+    # narrowing CAST (allowed only because export_merge_tree_part_allow_lossy_cast=1).
+    # Int8/UInt8 are not Iceberg-representable, so the narrowest integer dest is Int16.
+    source_columns = (
+        "big Int64, ubig UInt64, mid Int32, "
+        "f Float64, dt DateTime64(6), d Date32, "
+        "opt_s Nullable(String), opt_i Nullable(Int64), year Int32"
+    )
+    dest_columns = (
+        "big Int32, ubig UInt32, mid Int16, "
+        "f Float32, dt DateTime, d Date, "
+        "opt_s Nullable(String), opt_i Nullable(Int32), year Int32"
+    )
+
+    make_rmt(node, mt_table, source_columns, "year", replica_name="replica1")
+    make_iceberg_s3(node, iceberg_table, dest_columns, partition_by="year")
+
+    # Values chosen to fit the destination types (the async cast wraps on overflow
+    # rather than throwing, so out-of-range values would silently corrupt instead).
+    # opt_s is NULL and opt_i is set, covering both nullable paths in one row.
+    node.query(
+        f"""
+        INSERT INTO {mt_table} (big, ubig, mid, f, dt, d, opt_s, opt_i, year)
+        VALUES (
+            1000000, 2000000000, 30000,
+            2.718281828459045, '2024-01-15 12:30:45.123456', '2024-01-15',
+            NULL, 100, 2024
+        )
+        """
+    )
+
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2024' TO TABLE {iceberg_table}",
+        settings={
+            "allow_insert_into_iceberg": 1,
+            "export_merge_tree_part_allow_lossy_cast": 1,
+        },
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, "2024", "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 1, f"Expected 1 row in Iceberg table, got {count}"
+
+    result = node.query(
+        f"SELECT big, ubig, mid, toString(d), toString(dt), opt_s, opt_i, year FROM {iceberg_table}"
+    ).strip()
+    expected = "\t".join([
+        "1000000", "2000000000", "30000",
+        "2024-01-15", "2024-01-15 12:30:45.000000", "\\N", "100", "2024",
+    ])
+    assert result == expected, f"Unexpected lossy round-trip data:\n{result!r}\nexpected:\n{expected!r}"
+
+    # Float64 -> Float32 stays within Float32 precision but is no longer exact.
+    f_checks = node.query(
+        f"SELECT abs(f - 2.718281828459045) < 1e-6, abs(f - 2.718281828459045) > 1e-9 FROM {iceberg_table}"
+    ).strip()
+    assert f_checks == "1\t1", f"Expected Float32 precision loss within tolerance, got: {f_checks!r}"

From 6f822c8074bcf9aeea7544024a5f226a429a1cfb Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Mon, 15 Jun 2026 17:55:35 -0300
Subject: [PATCH 10/33] add docs

---
 docs/en/antalya/part_export.md      | 6 ++++++
 docs/en/antalya/partition_export.md | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md
index 03f9f479991b..b73efa277e2a 100644
--- a/docs/en/antalya/part_export.md
+++ b/docs/en/antalya/part_export.md
@@ -107,6 +107,12 @@ In case a table function is used as the destination, the schema can be omitted a
 - **Default**: `{part_name}_{checksum}`
 - **Description**: Pattern for the filename of the exported merge tree part. The `part_name` and `checksum` are calculated and replaced on the fly. Additional macros are supported.
 
+### `export_merge_tree_part_allow_lossy_cast` (Optional)
+
+- **Type**: `Bool`
+- **Default**: `false`
+- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+
 
 ## Examples
 
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index 975915859482..9bd1dfcd16f1 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -111,6 +111,12 @@ Notes:
 - Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation.
 - Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires.
 
+### `export_merge_tree_part_allow_lossy_cast` (Optional)
+
+- **Type**: `Bool`
+- **Default**: `false`
+- **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+
 ## Examples
 
 ### Basic Export to S3

From f521a95c87278262e89bae8c6ec15218c32ced00 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Tue, 16 Jun 2026 09:59:40 -0300
Subject: [PATCH 11/33] improve docs with a warning

---
 docs/en/antalya/part_export.md      | 2 ++
 docs/en/antalya/partition_export.md | 2 ++
 src/Core/Settings.cpp               | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md
index b73efa277e2a..88e1c831a536 100644
--- a/docs/en/antalya/part_export.md
+++ b/docs/en/antalya/part_export.md
@@ -113,6 +113,8 @@ In case a table function is used as the destination, the schema can be omitted a
 - **Default**: `false`
 - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
 
+  **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+
 
 ## Examples
 
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index 9bd1dfcd16f1..5a5dd2918869 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -117,6 +117,8 @@ Notes:
 - **Default**: `false`
 - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
 
+  **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+
 ## Examples
 
 ### Basic Export to S3
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 6368ec01c1cb..5fcf971ecc56 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7608,6 +7608,8 @@ Pattern for the filename of the exported merge tree part. The `part_name` and `c
 )", 0) \
     DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"(
 Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
+
+Warning: the partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Lossy casts on a partition column can therefore make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the metadata will still contain the original `Int64` values while the data files contain the truncated `Int32` values.
 )", 0) \
     \
     /* ####################################################### */ \

From 41c64eda6f35eeedb14601435f130e0ccad78993 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Tue, 16 Jun 2026 21:20:03 -0300
Subject: [PATCH 12/33] hmm... maybe incomplete, gotta continue it tomorrow

---
 docs/en/antalya/part_export.md                |  4 +-
 docs/en/antalya/partition_export.md           |  4 +-
 src/Core/Settings.cpp                         |  2 +-
 ...portReplicatedMergeTreePartitionManifest.h |  7 +++
 src/Storages/IStorage.h                       | 10 ++--
 src/Storages/MergeTree/ExportPartTask.cpp     |  2 +-
 .../MergeTree/ExportPartitionUtils.cpp        | 18 +++---
 src/Storages/MergeTree/ExportPartitionUtils.h |  7 ++-
 .../DataLakes/IDataLakeMetadata.h             |  2 +-
 .../DataLakes/Iceberg/IcebergMetadata.cpp     | 58 ++++++++++++++++++-
 .../DataLakes/Iceberg/IcebergMetadata.h       |  2 +-
 .../ObjectStorage/StorageObjectStorage.cpp    |  5 +-
 src/Storages/StorageReplicatedMergeTree.cpp   |  2 +
 13 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/docs/en/antalya/part_export.md b/docs/en/antalya/part_export.md
index 88e1c831a536..73d467c5d9b1 100644
--- a/docs/en/antalya/part_export.md
+++ b/docs/en/antalya/part_export.md
@@ -113,7 +113,9 @@ In case a table function is used as the destination, the schema can be omitted a
 - **Default**: `false`
 - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
 
-  **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+  When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use. This keeps the Iceberg metadata consistent with the data files.
+
+  **Warning:** A lossy cast on a partition column remains semantically truncating. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into a destination `Int32` partition column, both the data files and the Iceberg metadata will contain the truncated `Int32` value (they agree with each other, but the original `Int64` value is lost). Such casts require `export_merge_tree_part_allow_lossy_cast = 1`.
 
 
 ## Examples
diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index 5a5dd2918869..bb17a57f87cd 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -117,7 +117,9 @@ Notes:
 - **Default**: `false`
 - **Description**: Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
 
-  **Warning:** The partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Because of this, lossy casts can make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the Iceberg metadata will still contain the original `Int64` partition values, while the data files may contain the values after they were cast and truncated to `Int32`. As a result, the Iceberg metadata may say that a data file belongs to a partition with a value outside the `Int32` range, while the actual data file contains the truncated value.
+  When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use. This keeps the Iceberg metadata consistent with the data files.
+
+  **Warning:** A lossy cast on a partition column remains semantically truncating. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into a destination `Int32` partition column, both the data files and the Iceberg metadata will contain the truncated `Int32` value (they agree with each other, but the original `Int64` value is lost). Such casts require `export_merge_tree_part_allow_lossy_cast = 1`.
 
 ## Examples
 
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 5fcf971ecc56..d905075c5b71 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7609,7 +7609,7 @@ Pattern for the filename of the exported merge tree part. The `part_name` and `c
     DECLARE(Bool, export_merge_tree_part_allow_lossy_cast, false, R"(
 Allow `EXPORT PART`/`EXPORT PARTITION` to apply lossy (non-value-preserving) casts when the source and destination column types differ. When disabled, an export that would require a lossy cast throws instead.
 
-Warning: the partition key is not cast before it is written to the Apache Iceberg metadata file — it is written using the original value from the ClickHouse table. Lossy casts on a partition column can therefore make the Iceberg metadata inconsistent with the exported data files. For example, if a table is partitioned by an `Int64` column and some partition values do not fit into `Int32`, the metadata will still contain the original `Int64` values while the data files contain the truncated `Int32` values.
+When exporting to Apache Iceberg, the partition value written to the metadata is derived from the source partition columns by casting them to the destination partition-field types and applying the destination partition transform — the same computation the exported data files use, so the metadata stays consistent with the data. A lossy cast on a partition column remains semantically truncating: both the data files and the metadata contain the truncated value, and such casts require this setting to be enabled.
 )", 0) \
     \
     /* ####################################################### */ \
diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
index acfabc28ca61..db26836bdb2e 100644
--- a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
+++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
@@ -174,6 +174,7 @@ struct ExportReplicatedMergeTreePartitionManifest
     MergeTreePartExportManifest::FileAlreadyExistsPolicy file_already_exists_policy;
     String filename_pattern;
     bool write_full_path_in_iceberg_metadata = false;
+    bool allow_lossy_cast = false;
     String iceberg_metadata_json;
 
     std::string toJsonString() const
@@ -208,6 +209,7 @@ struct ExportReplicatedMergeTreePartitionManifest
         json.set("ttl_seconds", ttl_seconds);
         json.set("task_timeout_seconds", task_timeout_seconds);
         json.set("write_full_path_in_iceberg_metadata", write_full_path_in_iceberg_metadata);
+        json.set("allow_lossy_cast", allow_lossy_cast);
         std::ostringstream oss;     // STYLE_CHECK_ALLOW_STD_STRING_STREAM
         oss.exceptions(std::ios::failbit);
         Poco::JSON::Stringifier::stringify(json, oss);
@@ -262,6 +264,11 @@ struct ExportReplicatedMergeTreePartitionManifest
 
         manifest.write_full_path_in_iceberg_metadata = json->getValue<bool>("write_full_path_in_iceberg_metadata");
 
+        /// Default to true for tasks created before this field existed, so an in-flight
+        /// export scheduled with the old permissive worker behavior is not wrongly rejected
+        /// on upgrade. New tasks always persist the initiator's actual choice.
+        manifest.allow_lossy_cast = json->has("allow_lossy_cast") ? json->getValue<bool>("allow_lossy_cast") : true;
+
         return manifest;
     }
 };
diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h
index 0a07b892cef8..93caf3f65ebe 100644
--- a/src/Storages/IStorage.h
+++ b/src/Storages/IStorage.h
@@ -504,11 +504,11 @@ It is currently only implemented in StorageObjectStorage.
     struct IcebergCommitExportPartitionArguments
     {
       std::string metadata_json_string;
-      /// Partition column values (after transforms). Callers are responsible for
-      /// populating this: the partition-export path parses them from the persisted
-      /// JSON string, while the direct EXPORT PART path reads them from the part's
-      /// partition key.
-      std::vector<Field> partition_values;
+      /// Representative source partition-key columns from one exported part (the part's
+      /// minmax block). The destination derives the Iceberg partition tuple from a row of
+      /// this block by casting to the destination column types and applying the partition
+      /// transform, so the metadata partition value matches the exported data files.
+      Block partition_source_block;
     };
 
     virtual void commitExportPartitionTransaction(
diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp
index f2035577bbf6..e626e7364eaa 100644
--- a/src/Storages/MergeTree/ExportPartTask.cpp
+++ b/src/Storages/MergeTree/ExportPartTask.cpp
@@ -332,7 +332,7 @@ bool ExportPartTask::executeStep()
         {
             IStorage::IcebergCommitExportPartitionArguments iceberg_args;
             iceberg_args.metadata_json_string = manifest.iceberg_metadata_json;
-            iceberg_args.partition_values = manifest.data_part->partition.value;
+            iceberg_args.partition_source_block = block_with_partition_values;
 
             destination_storage->commitExportPartitionTransaction(
                 manifest.transaction_id,
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp
index c902a4042218..679e07d0b132 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.cpp
+++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp
@@ -57,14 +57,13 @@ namespace fs = std::filesystem;
 
 namespace ExportPartitionUtils
 {
-    std::vector<Field> getPartitionValuesForIcebergCommit(
+    Block getPartitionSourceBlockForIcebergCommit(
         MergeTreeData & storage, const String & partition_id)
     {
         auto lock = storage.readLockParts();
         const auto parts = storage.getDataPartsVectorInPartitionForInternalUsage(
             MergeTreeDataPartState::Active, partition_id, lock);
-        
-        /// todo arthur: bad arguments for now, pick a better one
+
         if (parts.empty())
             throw Exception(ErrorCodes::NO_SUCH_DATA_PART,
                 "Cannot find active part for partition_id '{}' to derive Iceberg partition "
@@ -72,7 +71,7 @@ namespace ExportPartitionUtils
                 "or this replica has not yet received any part for this partition. "
                 "The commit will be retried.",
                 partition_id);
-        return parts.front()->partition.value;
+        return parts.front()->minmax_idx->getBlock(storage);
     }
 
     ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest)
@@ -102,8 +101,11 @@ namespace ExportPartitionUtils
         /// stalls when the setting is only set at the query level.
         context_copy->setSetting("allow_insert_into_iceberg", true);
 
-        /// This setting, just like allow_insert_into_iceberg, has been verified at request time. At this point, we allow everything.
-        context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", true);
+        /// Reapply the initiator's lossy-cast decision (persisted in the manifest) so the
+        /// worker's schema revalidation honors the user's choice. Without this, a task
+        /// scheduled without the opt-in could still apply a lossy cast if the destination
+        /// schema drifts to a lossy target between scheduling and execution.
+        context_copy->setSetting("export_merge_tree_part_allow_lossy_cast", manifest.allow_lossy_cast);
 
 	    return context_copy;
     }
@@ -217,8 +219,8 @@ namespace ExportPartitionUtils
         {
             iceberg_args.metadata_json_string = manifest.iceberg_metadata_json;
             if (source_storage.getInMemoryMetadataPtr()->hasPartitionKey())
-                iceberg_args.partition_values =
-                    getPartitionValuesForIcebergCommit(source_storage, manifest.partition_id);
+                iceberg_args.partition_source_block =
+                    getPartitionSourceBlockForIcebergCommit(source_storage, manifest.partition_id);
         }
 
         destination_storage->commitExportPartitionTransaction(manifest.transaction_id, manifest.partition_id, exported_paths, iceberg_args, context);
diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h
index d97bb538a47f..0434bc59a2cb 100644
--- a/src/Storages/MergeTree/ExportPartitionUtils.h
+++ b/src/Storages/MergeTree/ExportPartitionUtils.h
@@ -27,14 +27,15 @@ namespace ExportPartitionUtils
 
     ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest);
 
-    /// Returns the partition key values for the given partition_id by reading from
-    /// the first active local part. Throws LOGICAL_ERROR if no such part is found.
+    /// Returns the representative source partition-key columns (the first active local part's
+    /// minmax block) for the given partition_id. The destination recomputes the Iceberg partition
+    /// tuple from this block by casting to its column types and applying the partition transform.
     ///
     /// Edge case: if the partition was dropped after export started, or this replica
     /// has not yet received any part for this partition (extreme replication lag on a
     /// recovery path), no active part will be found and the commit will fail. The task
     /// will be retried on the next poll cycle or picked up by a different replica.
-    std::vector<Field> getPartitionValuesForIcebergCommit(
+    Block getPartitionSourceBlockForIcebergCommit(
         MergeTreeData & storage, const String & partition_id);
 
     void commit(
diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
index 85c91ba5de19..a90c49b35210 100644
--- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
@@ -226,7 +226,7 @@ class IDataLakeMetadata : boost::noncopyable
         const String & /* transaction_id */,
         Int64 /* original_schema_id */,
         Int64 /* partition_spec_id */,
-        const std::vector<Field> & /* partition_values */,
+        const Block & /* partition_source_block */,
         SharedHeader /* sample_block */,
         const std::vector<String> & /* data_file_paths */,
         ContextPtr /* context */)
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
index 6016be1cd341..9819a8783516 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp
@@ -30,6 +30,7 @@
 #include <Common/Exception.h>
 
 #include <Interpreters/PreparedSets.h>
+#include <Interpreters/castColumn.h>
 #include <Storages/ObjectStorage/Utils.h>
 
 #include <Databases/DataLake/Common.h>
@@ -1524,6 +1525,57 @@ Poco::JSON::Object::Ptr lookupSchema(const Poco::JSON::Object::Ptr & meta, Int64
         "Schema with id {} not found in table metadata", schema_id);
 }
 
+/// Derive the Iceberg partition tuple for an exported part from a representative source row.
+/// The MergeTree `partition.value` is the source partition-key expression result; it is neither
+/// cast to the destination column types nor expressed through the Iceberg transform, so it must
+/// not be written to metadata directly. Within a MergeTree partition the transform result is
+/// constant, so a single representative value per partition-source column (taken from the part's
+/// minmax block) suffices: cast it to the destination column type and run the same transform the
+/// data uses. The result is transform-correct and consistent with the exported data files.
+std::vector<Field> recomputeExportPartitionValues(
+    ChunkPartitioner & partitioner,
+    const SharedHeader & sample_block,
+    const Block & partition_source_block)
+{
+    const auto & partition_columns = partitioner.getColumns();
+    if (partition_columns.empty())
+        return {};
+
+    for (const auto & column_name : partition_columns)
+        if (!partition_source_block.has(column_name))
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                "Partition source column '{}' required by the Iceberg partition transform is missing "
+                "from the representative source block while committing an export.", column_name);
+
+    Columns columns;
+    columns.reserve(sample_block->columns());
+    for (size_t i = 0; i < sample_block->columns(); ++i)
+    {
+        const auto & dest_column = sample_block->getByPosition(i);
+        if (partition_source_block.has(dest_column.name))
+        {
+            const auto & source = partition_source_block.getByName(dest_column.name);
+            ColumnWithTypeAndName representative{source.column->cut(0, 1), source.type, source.name};
+            columns.push_back(castColumn(representative, dest_column.type));
+        }
+        else
+        {
+            auto column = dest_column.type->createColumn();
+            column->insertDefault();
+            columns.push_back(std::move(column));
+        }
+    }
+
+    auto partitioned = partitioner.partitionChunk(Chunk(std::move(columns), 1));
+    if (partitioned.size() != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR,
+            "Recomputing Iceberg partition values produced {} partitions for a single representative row; "
+            "a MergeTree partition must map to exactly one Iceberg partition.", partitioned.size());
+
+    const auto & key = partitioned.front().first;
+    return std::vector<Field>(key.begin(), key.end());
+}
+
 }
 
 bool IcebergMetadata::commitImportPartitionTransactionImpl(
@@ -1826,7 +1878,7 @@ void IcebergMetadata::commitExportPartitionTransaction(
     const String & transaction_id,
     Int64 original_schema_id,
     Int64 partition_spec_id,
-    const std::vector<Field> & partition_values,
+    const Block & partition_source_block,
     SharedHeader sample_block,
     const std::vector<String> & data_file_paths,
     ContextPtr context)
@@ -1895,6 +1947,10 @@ void IcebergMetadata::commitExportPartitionTransaction(
     const auto partition_columns = partitioner.getColumns();
     const auto partition_types = partitioner.getResultTypes();
 
+    /// Recompute the partition tuple via the destination transform so the metadata partition
+    /// value matches the exported data (rather than the raw source MergeTree partition value).
+    const auto partition_values = recomputeExportPartitionValues(partitioner, sample_block, partition_source_block);
+
     const auto metadata_compression_method = persistent_components.metadata_compression_method;
 
     FileNamesGenerator filename_generator = FileNamesGenerator(
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
index 2e28df93e0e6..2ec8efe4a9b7 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
@@ -141,7 +141,7 @@ class IcebergMetadata : public IDataLakeMetadata
         const String & transaction_id,
         Int64 original_schema_id,
         Int64 partition_spec_id,
-        const std::vector<Field> & partition_values,
+        const Block & partition_source_block,
         SharedHeader sample_block,
         const std::vector<String> & data_file_paths,
         ContextPtr context) override;
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
index c08db678c46c..50f8c373767d 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
@@ -680,7 +680,8 @@ void StorageObjectStorage::commitExportPartitionTransaction(
         /// Parse the Iceberg metadata snapshot (stored in ZooKeeper at export-start time) only to
         /// extract the schema-id and partition-spec-id that were current when the export began.
         /// partition_columns and partition_types are derived inside commitExportPartitionTransaction
-        /// from the same JSON, so only partition_values need to be carried here.
+        /// from the same JSON; the representative source partition columns are carried here so the
+        /// partition tuple can be recomputed through the destination transform.
         Poco::JSON::Parser iceberg_parser;
         Poco::JSON::Object::Ptr iceberg_metadata =
             iceberg_parser.parse(iceberg_commit_export_partition_arguments.metadata_json_string).extract<Poco::JSON::Object::Ptr>();
@@ -695,7 +696,7 @@ void StorageObjectStorage::commitExportPartitionTransaction(
             transaction_id,
             original_schema_id,
             partition_spec_id,
-            iceberg_commit_export_partition_arguments.partition_values,
+            iceberg_commit_export_partition_arguments.partition_source_block,
             std::make_shared<const Block>(getInMemoryMetadataPtr()->getSampleBlock()),
             exported_paths,
             local_context);
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 2301ec1c1a30..10eee4e5db95 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -225,6 +225,7 @@ namespace Setting
     extern const SettingsUInt64 export_merge_tree_part_max_rows_per_file;
     extern const SettingsBool export_merge_tree_part_throw_on_pending_mutations;
     extern const SettingsBool export_merge_tree_part_throw_on_pending_patch_parts;
+    extern const SettingsBool export_merge_tree_part_allow_lossy_cast;
     extern const SettingsExportPartitionAllOnError export_merge_tree_partition_all_on_error;
     extern const SettingsString export_merge_tree_part_filename_pattern;
     extern const SettingsBool write_full_path_in_iceberg_metadata;
@@ -8537,6 +8538,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
     manifest.file_already_exists_policy = query_context->getSettingsRef()[Setting::export_merge_tree_part_file_already_exists_policy].value;
     manifest.filename_pattern = query_context->getSettingsRef()[Setting::export_merge_tree_part_filename_pattern].value;
     manifest.write_full_path_in_iceberg_metadata = query_context->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata];
+    manifest.allow_lossy_cast = query_context->getSettingsRef()[Setting::export_merge_tree_part_allow_lossy_cast];
 
     if (dest_storage->isDataLake())
     {

From 3b177ed647fe9c74fabe99930d7b8e89c9c34985 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Wed, 17 Jun 2026 09:51:12 -0300
Subject: [PATCH 13/33] add tests covering corner tricky case

---
 .../test.py                                   | 246 ++++++++++++++++++
 1 file changed, 246 insertions(+)

diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 09cf1c2ff467..00cf5b5ff6f2 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -1396,3 +1396,249 @@ def test_export_partition_all_iceberg_types_lossy(cluster):
         f"SELECT abs(f - 2.718281828459045) < 1e-6, abs(f - 2.718281828459045) > 1e-9 FROM {iceberg_table}"
     ).strip()
     assert f_checks == "1\t1", f"Expected Float32 precision loss within tolerance, got: {f_checks!r}"
+
+
+def _data_file_partition_records(entries):
+    """Partition dicts of the non-delete data files described by manifest entries."""
+    records = []
+    for entry in entries:
+        data_file = entry.get("data_file") or {}
+        if data_file.get("content", 0) not in (0, None):
+            continue
+        partition = data_file.get("partition")
+        if partition is not None:
+            records.append(partition)
+    return records
+
+
+def _partition_scalar(partition, field):
+    """Read a partition field value, tolerating an Avro-union ``{type: value}`` wrapper."""
+    value = partition.get(field)
+    if isinstance(value, dict):
+        assert len(value) == 1, f"Unexpected partition union shape for {field!r}: {value!r}"
+        value = next(iter(value.values()))
+    return value
+
+
+def test_export_partition_bucket_transform_metadata_matches_data(cluster):
+    """A bucket[N] partition column whose type changes Int64 -> String records the
+    destination murmur(String) bucket in the Iceberg metadata, matching the exported
+    data rather than the source hashLong bucket."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_bucket_xform_{uid}"
+    iceberg_table = f"iceberg_bucket_xform_{uid}"
+
+    # N=16, key=42 diverges: icebergBucket(16, 42::Int64)=14 (source/old hashLong) but
+    # icebergBucket(16, '42')=6 (destination/new murmur over the exported String).
+    make_rmt(node, mt_table, "id Int64, key Int64", "icebergBucket(16, key)",
+             replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 42), (2, 42)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int64, key String",
+                    partition_by="icebergBucket(16, key)")
+
+    pid = first_partition_id(node, mt_table)
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 2, f"Expected 2 rows after export, got {count}"
+
+    string_bucket = int(node.query(
+        f"SELECT DISTINCT icebergBucket(16, key) FROM {iceberg_table}"
+    ).strip())
+    long_bucket = int(node.query(
+        f"SELECT DISTINCT icebergBucket(16, toInt64(key)) FROM {iceberg_table}"
+    ).strip())
+    assert string_bucket != long_bucket, (
+        f"Test setup invalid: String and Int64 buckets coincide ({string_bucket}); "
+        f"pick a different N/key so the transform diverges."
+    )
+
+    query_id = f"bucket_xform_{uid}"
+    node.query(
+        f"SELECT * FROM {iceberg_table}",
+        query_id=query_id,
+        settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+    )
+    entries = fetch_manifest_entries(node, query_id)
+    partitions = _data_file_partition_records(entries)
+    assert partitions, "No data-file partition records found in manifest entries"
+    meta_values = {int(_partition_scalar(p, "key")) for p in partitions}
+    assert meta_values == {string_bucket}, (
+        f"Metadata bucket {meta_values} must equal the destination String bucket "
+        f"{string_bucket} (not the source Int64 bucket {long_bucket})."
+    )
+
+
+def test_export_partition_month_transform_metadata_matches_data(cluster):
+    """A month-transform partition records a months-since-epoch value in metadata that
+    matches the value derived from the exported data, and a transform-filtered read
+    returns the rows."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_month_xform_{uid}"
+    iceberg_table = f"iceberg_month_xform_{uid}"
+
+    make_rmt(node, mt_table, "id Int64, event_date Date",
+             "toMonthNumSinceEpoch(event_date)", replica_name="replica1")
+    node.query(
+        f"INSERT INTO {mt_table} VALUES "
+        f"(1, '2024-03-05'), (2, '2024-03-20'), (3, '2024-03-31')"
+    )
+
+    make_iceberg_s3(node, iceberg_table, "id Int64, event_date Date",
+                    partition_by="toMonthNumSinceEpoch(event_date)")
+
+    pid = first_partition_id(node, mt_table)
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 3, f"Expected 3 rows after export, got {count}"
+
+    month_num = int(node.query(
+        f"SELECT DISTINCT toMonthNumSinceEpoch(event_date) FROM {iceberg_table}"
+    ).strip())
+
+    query_id = f"month_xform_{uid}"
+    node.query(
+        f"SELECT * FROM {iceberg_table}",
+        query_id=query_id,
+        settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+    )
+    entries = fetch_manifest_entries(node, query_id)
+    partitions = _data_file_partition_records(entries)
+    assert partitions, "No data-file partition records found in manifest entries"
+    meta_values = {int(_partition_scalar(p, "event_date")) for p in partitions}
+    assert meta_values == {month_num}, (
+        f"Metadata month {meta_values} must equal toMonthNumSinceEpoch over the data "
+        f"({month_num})."
+    )
+
+    filtered = int(node.query(
+        f"SELECT count() FROM {iceberg_table} "
+        f"WHERE toMonthNumSinceEpoch(event_date) = {month_num}"
+    ).strip())
+    assert filtered == 3, f"Transform-filtered read expected 3 rows, got {filtered}"
+
+
+def test_export_partition_identity_type_change_metadata_matches_data(cluster):
+    """An identity partition column whose type changes UInt16 -> String records the
+    destination String value in the Iceberg metadata, matching the exported data."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_identity_xform_{uid}"
+    iceberg_table = f"iceberg_identity_xform_{uid}"
+
+    make_rmt(node, mt_table, "id Int32, year UInt16", "year", replica_name="replica1")
+    node.query(f"INSERT INTO {mt_table} VALUES (1, 2024), (2, 2024)")
+
+    make_iceberg_s3(node, iceberg_table, "id Int32, year String", partition_by="year")
+
+    pid = first_partition_id(node, mt_table)
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 2, f"Expected 2 rows after export, got {count}"
+
+    data_year = node.query(f"SELECT DISTINCT year FROM {iceberg_table}").strip()
+    assert data_year == "2024", f"Expected exported year '2024' (String), got {data_year!r}"
+
+    query_id = f"identity_xform_{uid}"
+    node.query(
+        f"SELECT * FROM {iceberg_table}",
+        query_id=query_id,
+        settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+    )
+    entries = fetch_manifest_entries(node, query_id)
+    partitions = _data_file_partition_records(entries)
+    assert partitions, "No data-file partition records found in manifest entries"
+    meta_values = {str(_partition_scalar(p, "year")) for p in partitions}
+    assert meta_values == {"2024"}, (
+        f"Metadata partition {meta_values} must equal the destination String value "
+        f"'2024' (not the source integer representation)."
+    )
+
+
+def test_export_partition_multicolumn_identity_metadata_matches_data(cluster):
+    """A multi-column identity partition (event_date Date, retention UInt64 -> Int64)
+    records per-column values in the Iceberg metadata that match the exported data."""
+    node = cluster.instances["replica1"]
+
+    uid = unique_suffix()
+    mt_table = f"mt_multicol_{uid}"
+    iceberg_table = f"iceberg_multicol_{uid}"
+
+    # Iceberg has no unsigned types, so retention widens UInt64 -> Int64; the cast is
+    # not value-preserving per canBeSafelyCast, hence the lossy opt-in below.
+    make_rmt(node, mt_table, "id Int64, event_date Date, retention UInt64",
+             "(event_date, retention)", replica_name="replica1")
+    node.query(
+        f"INSERT INTO {mt_table} VALUES "
+        f"(1, '2024-03-05', 30), (2, '2024-03-05', 30), (3, '2024-03-05', 30)"
+    )
+
+    make_iceberg_s3(node, iceberg_table, "id Int64, event_date Date, retention Int64",
+                    partition_by="(event_date, retention)")
+
+    pid = first_partition_id(node, mt_table)
+    node.query(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '{pid}' TO TABLE {iceberg_table}",
+        settings={
+            "allow_insert_into_iceberg": 1,
+            "export_merge_tree_part_allow_lossy_cast": 1,
+        },
+    )
+    wait_for_export_status(node, mt_table, iceberg_table, pid, "COMPLETED")
+
+    count = int(node.query(f"SELECT count() FROM {iceberg_table}").strip())
+    assert count == 3, f"Expected 3 rows after export, got {count}"
+
+    data_retention = int(node.query(
+        f"SELECT DISTINCT retention FROM {iceberg_table}"
+    ).strip())
+    assert data_retention == 30, f"Expected exported retention 30, got {data_retention}"
+
+    days = int(node.query(
+        f"SELECT DISTINCT toInt64(event_date) FROM {iceberg_table}"
+    ).strip())
+
+    query_id = f"multicol_{uid}"
+    node.query(
+        f"SELECT * FROM {iceberg_table}",
+        query_id=query_id,
+        settings={"iceberg_metadata_log_level": "manifest_file_entry"},
+    )
+    entries = fetch_manifest_entries(node, query_id)
+    partitions = _data_file_partition_records(entries)
+    assert partitions, "No data-file partition records found in manifest entries"
+
+    meta_dates = {int(_partition_scalar(p, "event_date")) for p in partitions}
+    assert meta_dates == {days}, (
+        f"Metadata event_date {meta_dates} must equal days-since-epoch {days}."
+    )
+    meta_retentions = {int(_partition_scalar(p, "retention")) for p in partitions}
+    assert meta_retentions == {30}, (
+        f"Metadata retention {meta_retentions} must equal the exported value 30."
+    )
+
+    filtered = int(node.query(
+        f"SELECT count() FROM {iceberg_table} "
+        f"WHERE event_date = '2024-03-05' AND retention = 30"
+    ).strip())
+    assert filtered == 3, f"Partition-filtered read expected 3 rows, got {filtered}"

From 4a75d529c95b34897183bd4d110930ee97e73763 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Tue, 16 Jun 2026 17:09:41 -0300
Subject: [PATCH 14/33] rmv task ttl

---
 docs/en/antalya/partition_export.md           |  9 +----
 src/Core/Settings.cpp                         |  6 +--
 ...portReplicatedMergeTreePartitionManifest.h |  3 --
 .../ExportPartitionManifestUpdatingTask.cpp   | 37 +++++-------------
 src/Storages/StorageReplicatedMergeTree.cpp   | 33 ++--------------
 .../test.py                                   | 35 +++++++++--------
 .../test.py                                   | 39 ++++++++++---------
 7 files changed, 53 insertions(+), 109 deletions(-)

diff --git a/docs/en/antalya/partition_export.md b/docs/en/antalya/partition_export.md
index bb17a57f87cd..95a2cfe36fb4 100644
--- a/docs/en/antalya/partition_export.md
+++ b/docs/en/antalya/partition_export.md
@@ -59,7 +59,7 @@ TO TABLE [destination_database.]destination_table
 
 - **Type**: `Bool`
 - **Default**: `false`
-- **Description**: Ignore existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition to the same destination before the manifest expires. **IMPORTANT:** this is dangerous because it can lead to duplicated data, use it with caution.
+- **Description**: Ignore existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition that was already exported to the same destination. **IMPORTANT:** this is dangerous because it can lead to duplicated data, use it with caution.
 
 #### `export_merge_tree_partition_max_retries` (Optional)
 
@@ -67,12 +67,6 @@ TO TABLE [destination_database.]destination_table
 - **Default**: `3`
 - **Description**: Maximum number of retries for exporting a merge tree part in an export partition task. If it exceeds, the entire task fails.
 
-#### `export_merge_tree_partition_manifest_ttl` (Optional)
-
-- **Type**: `UInt64`
-- **Default**: `180` (seconds)
-- **Description**: Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination. This setting does not affect or delete in-progress tasks; it only cleans up completed ones.
-
 #### `export_merge_tree_part_file_already_exists_policy` (Optional)
 
 - **Type**: `MergeTreePartExportFileAlreadyExistsPolicy`
@@ -109,7 +103,6 @@ When the timeout is exceeded the task transitions to KILLED (same terminal state
 
 Notes:
 - Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation.
-- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires.
 
 ### `export_merge_tree_part_allow_lossy_cast` (Optional)
 
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index d905075c5b71..7a9089880d96 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7557,10 +7557,6 @@ Ignore existing partition export and overwrite the zookeeper entry
 )", 0) \
     DECLARE(UInt64, export_merge_tree_partition_max_retries, 3, R"(
 Maximum number of retries for exporting a merge tree part in an export partition task
-)", 0) \
-    DECLARE(UInt64, export_merge_tree_partition_manifest_ttl, 86400, R"(
-Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination.
-This setting does not affect / delete in progress tasks. It'll only cleanup the completed ones.
 )", 0) \
     DECLARE(UInt64, export_merge_tree_partition_task_timeout_seconds, 86400, R"(
 Maximum wall-clock duration (in seconds) an export partition task is allowed to remain in the PENDING state before it is auto-killed by the background cleanup loop.
@@ -7573,7 +7569,6 @@ In such scenario, ClickHouse would attempt to commit those files again producing
 
 Notes:
 - Enforcement is best-effort: actual kill latency is bounded by one manifest-updater poll cycle (~30s) plus ZooKeeper watch propagation.
-- Since both this timeout and `export_merge_tree_partition_manifest_ttl` are measured from `create_time`, keep `export_merge_tree_partition_manifest_ttl` greater than `export_merge_tree_partition_task_timeout_seconds` if you want the KILLED entry to remain visible in `system.replicated_partition_exports` after the timeout fires.
 )", 0) \
     DECLARE(MergeTreePartExportFileAlreadyExistsPolicy, export_merge_tree_part_file_already_exists_policy, MergeTreePartExportFileAlreadyExistsPolicy::skip, R"(
 Possible values:
@@ -7948,6 +7943,7 @@ Maximum number of WebAssembly UDF instances that can run in parallel per functio
 
 #define OBSOLETE_SETTINGS(M, ALIAS) \
     /** Obsolete settings which are kept around for compatibility reasons. They have no effect anymore. */ \
+    MAKE_OBSOLETE(M, UInt64, export_merge_tree_partition_manifest_ttl, 86400) \
     MAKE_OBSOLETE(M, Bool, query_condition_cache_store_conditions_as_plaintext, false) \
     MAKE_OBSOLETE(M, Bool, update_insert_deduplication_token_in_dependent_materialized_views, 0) \
     MAKE_OBSOLETE(M, UInt64, max_memory_usage_for_all_queries, 0) \
diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
index db26836bdb2e..4e28e5e4f505 100644
--- a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
+++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
@@ -164,7 +164,6 @@ struct ExportReplicatedMergeTreePartitionManifest
     std::vector<String> parts;
     time_t create_time;
     size_t max_retries;
-    size_t ttl_seconds;
     size_t task_timeout_seconds;
     size_t max_threads;
     bool parallel_formatting;
@@ -206,7 +205,6 @@ struct ExportReplicatedMergeTreePartitionManifest
         json.set("filename_pattern", filename_pattern);
         json.set("create_time", create_time);
         json.set("max_retries", max_retries);
-        json.set("ttl_seconds", ttl_seconds);
         json.set("task_timeout_seconds", task_timeout_seconds);
         json.set("write_full_path_in_iceberg_metadata", write_full_path_in_iceberg_metadata);
         json.set("allow_lossy_cast", allow_lossy_cast);
@@ -242,7 +240,6 @@ struct ExportReplicatedMergeTreePartitionManifest
             manifest.parts.push_back(parts_array->getElement<String>(static_cast<unsigned int>(i)));
         
         manifest.create_time = json->getValue<time_t>("create_time");
-        manifest.ttl_seconds = json->getValue<size_t>("ttl_seconds");
         manifest.task_timeout_seconds = json->getValue<size_t>("task_timeout_seconds");
         manifest.max_threads = json->getValue<size_t>("max_threads");
         manifest.parallel_formatting = json->getValue<bool>("parallel_formatting");
diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
index b45f3667be87..6926ee77953a 100644
--- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
+++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
@@ -128,20 +128,19 @@ namespace
     }
 
     /*
-        Remove expired entries and fix non-committed exports that have already exported all parts.
+        Enforce the PENDING task timeout and recover non-committed exports that have already
+        exported all parts. Entries are never removed for age — `system.replicated_partition_exports`
+        is append-only history.
 
-        Return values:
-        - true: the cleanup was successful, the entry is removed from the entries_by_key container and the function returns true. Proceed to the next entry.
-        - false: the cleanup was not successful, the entry is not removed from the entries_by_key container and the function returns false.
+        Always returns false: the entry stays in the in-memory container. A KILLED transition is
+        driven by the status watch, and a deferred commit is handled by the caller after the lock
+        is released.
 
         Side outputs:
         - `deferred_commits`: when a PENDING entry has all parts processed but the export was
           never committed, this function appends a CommitRecoveryWork item to be executed by
           the caller after releasing the storage-wide mutex. The actual commit() call (which
           performs network I/O to the destination catalog and S3) MUST NOT run under the lock.
-          The function still returns `false` in that case so the outer poll() loop falls through
-          to `addTask`, keeping the in-memory entry consistent regardless of whether the
-          deferred commit ultimately succeeds.
     */
     bool tryCleanup(
         const zkutil::ZooKeeperPtr & zk,
@@ -149,33 +148,17 @@ namespace
         const LoggerPtr & log,
         const ContextPtr & storage_context,
         StorageReplicatedMergeTree & storage,
-        const std::string & key,
         const ExportReplicatedMergeTreePartitionManifest & metadata,
         const time_t now,
         const bool is_pending,
-        auto & entries_by_key,
         std::vector<CommitRecoveryWork> & deferred_commits
     )
     {
-        bool has_expired = metadata.create_time < now - static_cast<time_t>(metadata.ttl_seconds);
-
         bool task_timed_out = is_pending
             && metadata.task_timeout_seconds > 0
             && metadata.create_time + static_cast<time_t>(metadata.task_timeout_seconds) < now;
 
-        if (has_expired && !is_pending)
-        {
-            zk->tryRemoveRecursive(fs::path(entry_path));
-            ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
-            ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRemoveRecursive);
-            auto it = entries_by_key.find(key);
-            if (it != entries_by_key.end())
-                entries_by_key.erase(it);
-            LOG_INFO(log, "ExportPartition Manifest Updating Task: Removed {}: expired", key);
-
-            return true;
-        }
-        else if (task_timed_out)
+        if (task_timed_out)
         {
             const std::string status_path = fs::path(entry_path) / "status";
 
@@ -365,8 +348,8 @@ void ExportPartitionManifestUpdatingTask::poll()
     const std::string cleanup_lock_path = fs::path(storage.zookeeper_path) / "exports_cleanup_lock";
 
     /// The `exports_cleanup_lock` is an ephemeral ZK node that serializes cleanup work
-    /// across replicas: only the replica holding it walks `tryCleanup` (entry expiry +
-    /// commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer
+    /// across replicas: only the replica holding it walks `tryCleanup` (task-timeout
+    /// enforcement + commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer
     /// replica's next poll() could acquire it and race us on the same commit-recovery work,
     /// duplicating REST-catalog round-trips and snapshot writes. The EphemeralNodeHolder
     /// destructor removes the node, so we declare it at function scope and let it die
@@ -478,11 +461,9 @@ void ExportPartitionManifestUpdatingTask::poll()
                     storage.log.load(),
                     storage.getContext(),
                     storage,
-                    key,
                     metadata,
                     now,
                     *status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING,
-                    entries_by_key,
                     deferred_commits);
 
                 if (cleanup_successful)
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 10eee4e5db95..046399f0bdfe 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -215,7 +215,6 @@ namespace Setting
     extern const SettingsBool allow_experimental_export_merge_tree_part;
     extern const SettingsBool export_merge_tree_partition_force_export;
     extern const SettingsUInt64 export_merge_tree_partition_max_retries;
-    extern const SettingsUInt64 export_merge_tree_partition_manifest_ttl;
     extern const SettingsUInt64 export_merge_tree_partition_task_timeout_seconds;
     extern const SettingsBool output_format_parallel_formatting;
     extern const SettingsBool output_format_parquet_parallel_encoding;
@@ -8416,36 +8415,11 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
     ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperExists);
     if (zookeeper->exists(partition_exports_path))
     {
-        LOG_INFO(log, "Export with key {} is already exported or it is being exported. Checking if it has expired so that we can overwrite it", export_key);
+        LOG_INFO(log, "Export with key {} is already exported or it is being exported", export_key);
 
-        bool has_expired = false;
-
-        ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
-        ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperExists);
-        if (zookeeper->exists(fs::path(partition_exports_path) / "metadata.json"))
-        {
-            std::string metadata_json;
-            ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
-            ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet);
-            if (zookeeper->tryGet(fs::path(partition_exports_path) / "metadata.json", metadata_json))
-            {
-                const auto manifest = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json);
-
-                const auto now = time(nullptr);
-                const auto expiration_time = manifest.create_time + manifest.ttl_seconds;
-
-                LOG_INFO(log, "Export with key {} has expiration time {}, now is {}", export_key, expiration_time, now);
-
-                if (static_cast<time_t>(expiration_time) < now)
-                {
-                    has_expired = true;
-                }
-            }
-        }
-
-        if (!has_expired && !query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export])
+        if (!query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export])
         {
-            throw Exception(ErrorCodes::EXPORT_PARTITION_ALREADY_EXPORTED, "Export with key {} already exported or it is being exported, and it has not expired. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key);
+            throw Exception(ErrorCodes::EXPORT_PARTITION_ALREADY_EXPORTED, "Export with key {} already exported or it is being exported. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key);
         }
 
         LOG_INFO(log, "Overwriting export with key {}", export_key);
@@ -8527,7 +8501,6 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
     manifest.parts = part_names;
     manifest.create_time = time(nullptr);
     manifest.max_retries = query_context->getSettingsRef()[Setting::export_merge_tree_partition_max_retries];
-    manifest.ttl_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_manifest_ttl];
     manifest.task_timeout_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_task_timeout_seconds];
     manifest.max_threads = query_context->getSettingsRef()[Setting::max_threads];
     manifest.parallel_formatting = query_context->getSettingsRef()[Setting::output_format_parallel_formatting];
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 00cf5b5ff6f2..89ceeca47003 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,13 +746,12 @@ def test_partition_key_compatibility_check(cluster):
     )
 
 
-def test_export_ttl(cluster):
+def test_export_entries_never_expire(cluster):
     """
-    After a manifest TTL expires the same partition can be re-exported, and the
-    new data is appended to (or replaces) what is in the Iceberg table.
+    Export entries never expire: a second export of the same partition is rejected
+    indefinitely unless export_merge_tree_partition_force_export is set.
     """
     node = cluster.instances["replica1"]
-    ttl_seconds = 3
 
     uid = unique_suffix()
     mt_table = f"mt_{uid}"
@@ -763,28 +762,31 @@ def test_export_ttl(cluster):
     # First export.
     node.query(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
-        f"SETTINGS export_merge_tree_partition_manifest_ttl = {ttl_seconds}, allow_insert_into_iceberg = 1"
+        f"SETTINGS allow_insert_into_iceberg = 1"
     )
+    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
+
+    count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip())
+    assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}"
 
-    # A second export before the TTL expires must be rejected.
+    # A second export is rejected, and stays rejected (the entry never expires).
     error = node.query_and_get_error(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
         settings={"allow_insert_into_iceberg": 1},
     )
-    assert "Export with key" in error, f"Expected duplicate-export error before TTL, got: {error}"
-
-    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
-    count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip())
-    assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}"
+    assert "Export with key" in error, f"Expected duplicate-export error, got: {error}"
 
-    # Wait for the manifest TTL to expire.
-    time.sleep(ttl_seconds * 2)
+    time.sleep(5)
+    error = node.query_and_get_error(
+        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
+        settings={"allow_insert_into_iceberg": 1},
+    )
+    assert "Export with key" in error, f"Entry must not expire, got: {error}"
 
-    # Second export must be accepted now.
+    # Only force_export can overwrite the existing entry.
     node.query(
         f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
-        settings={"allow_insert_into_iceberg": 1},
+        settings={"allow_insert_into_iceberg": 1, "export_merge_tree_partition_force_export": 1},
     )
     wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
 
@@ -888,7 +890,6 @@ def test_export_task_timeout_kills_stuck_pending_task(cluster):
             f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}"
             f" SETTINGS export_merge_tree_partition_task_timeout_seconds = 5,"
             f"          export_merge_tree_partition_max_retries = 1000000,"
-            f"          export_merge_tree_partition_manifest_ttl = 3600,"
             f"          allow_insert_into_iceberg = 1"
         )
 
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index d403538999df..c4e38cbff276 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -627,40 +627,43 @@ def test_inject_short_living_failures(cluster):
     assert int(exception_count.strip()) >= 1, "Expected at least one exception"
 
 
-def test_export_ttl(cluster):
+def test_export_entries_never_expire(cluster):
+    """Export entries are kept as history; a duplicate export is rejected unless forced."""
     node = cluster.instances["replica1"]
 
     postfix = str(uuid.uuid4()).replace("-", "_")
     mt_table = f"export_ttl_mt_table_{postfix}"
     s3_table = f"export_ttl_s3_table_{postfix}"
 
-    expiration_time = 3
-
     create_tables_and_insert_data(node, mt_table, s3_table, "replica1")
 
     # start export
-    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_manifest_ttl={expiration_time};")
-
-    # assert that I get an error when trying to export the same partition again, query_and_get_error
-    error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
-    assert "Export with key" in error, "Expected error about expired export"
-
-    # wait for the export to finish and for the manifest to expire
+    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
     wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
-    time.sleep(expiration_time * 2)
 
-    # assert that the export succeeded, check the commit file
     assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed"
 
-    # start export again
-    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}")
+    # the entry never expires, so re-exporting the same partition is always rejected
+    error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
+    assert "Export with key" in error, "Expected duplicate-export error"
 
-    # wait for the export to finish
+    # the completed entry is still present in the history table after a while
+    time.sleep(5)
+    assert node.query(
+        f"""
+        SELECT status FROM system.replicated_partition_exports
+        WHERE source_table = '{mt_table}'
+          AND destination_table = '{s3_table}'
+          AND partition_id = '2020'
+        """
+    ) == "COMPLETED\n", "Export entry should remain in history and not expire"
+
+    # only force_export can overwrite the existing entry
+    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export = 1")
     wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
 
-    # assert that the export succeeded, check the commit file
-    # there should be two commit files now, one for the first export and one for the second export
-    assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Export did not succeed"
+    # there should be two commit files now, one for each export
+    assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Forced re-export did not succeed"
 
 
 def test_export_partition_file_already_exists_policy(cluster):

From a676814106777b62d94c002e8754582da3ad4e71 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Tue, 16 Jun 2026 17:18:32 -0300
Subject: [PATCH 15/33] cleanup

---
 .../ExportPartitionManifestUpdatingTask.cpp   | 43 +++++++-----------
 .../test.py                                   | 45 -------------------
 .../test.py                                   | 39 ----------------
 3 files changed, 17 insertions(+), 110 deletions(-)

diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
index 6926ee77953a..8ba8d8fc64a0 100644
--- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
+++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp
@@ -130,11 +130,9 @@ namespace
     /*
         Enforce the PENDING task timeout and recover non-committed exports that have already
         exported all parts. Entries are never removed for age — `system.replicated_partition_exports`
-        is append-only history.
-
-        Always returns false: the entry stays in the in-memory container. A KILLED transition is
-        driven by the status watch, and a deferred commit is handled by the caller after the lock
-        is released.
+        is append-only history, so the entry always stays in the in-memory container: a KILLED
+        transition is driven by the status watch, and a deferred commit is handled by the caller
+        after the lock is released.
 
         Side outputs:
         - `deferred_commits`: when a PENDING entry has all parts processed but the export was
@@ -142,7 +140,7 @@ namespace
           the caller after releasing the storage-wide mutex. The actual commit() call (which
           performs network I/O to the destination catalog and S3) MUST NOT run under the lock.
     */
-    bool tryCleanup(
+    void tryCleanup(
         const zkutil::ZooKeeperPtr & zk,
         const std::string & entry_path,
         const LoggerPtr & log,
@@ -170,14 +168,14 @@ namespace
             if (!zk->tryGet(status_path, status_string, &status_stat))
             {
                 LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to read status for {} while enforcing task timeout, skipping", entry_path);
-                return false;
+                return;
             }
 
             const auto current_status = magic_enum::enum_cast<ExportReplicatedMergeTreePartitionTaskEntry::Status>(status_string);
             if (!current_status || *current_status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING)
             {
                 LOG_INFO(log, "ExportPartition Manifest Updating Task: Task {} is not PENDING, can't set to KILLED, skipping", entry_path);
-                return false;
+                return;
             }
 
             const auto timeout_message = fmt::format(
@@ -217,9 +215,9 @@ namespace
                     entry_path, rc);
             }
 
-            /// Return false so the entry remains in entries_by_key; the status watch will drive
+            /// The entry remains in entries_by_key; the status watch will drive
             /// handleStatusChanges -> killExportPart on every replica, mirroring user-initiated KILL.
-            return false;
+            return;
         }
         else if (is_pending)
         {
@@ -232,7 +230,7 @@ namespace
             {
 
                 LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to get parts in processing or pending, skipping");
-                return false;
+                return;
             }
 
             if (parts_in_processing_or_pending.empty())
@@ -244,7 +242,7 @@ namespace
                 if (!destination_storage)
                 {
                     LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs());
-                    return false;
+                    return;
                 }
 
                 /// A replica exported the last part but the commit never landed. Capture everything
@@ -253,22 +251,18 @@ namespace
                 /// MAX_TRANSACTION_RETRIES = 100 retries; holding the storage-wide mutex across
                 /// that work is what caused `system.replicated_partition_exports` to hang.
                 ///
-                /// Returning false here keeps the outer poll() loop on the normal path: it will
-                /// call addTask() so the in-memory container reflects the PENDING entry. The
-                /// status watch registered by poll() will transition the local entry to
-                /// COMPLETED/FAILED once the deferred commit (or a peer's commit) updates
-                /// /status in ZooKeeper.
+                /// The outer poll() loop stays on the normal path: it will call addTask() so the
+                /// in-memory container reflects the PENDING entry. The status watch registered by
+                /// poll() will transition the local entry to COMPLETED/FAILED once the deferred
+                /// commit (or a peer's commit) updates /status in ZooKeeper.
                 deferred_commits.push_back(CommitRecoveryWork{
                     .metadata = metadata,
                     .entry_path = entry_path,
                     .destination_storage = destination_storage,
                     .context = context,
                 });
-                return false;
             }
         }
-
-        return false;
     }
 }
 
@@ -451,11 +445,11 @@ void ExportPartitionManifestUpdatingTask::poll()
                 continue;
             }
 
-            /// if we have the cleanup lock, try to cleanup
-            /// if we successfully cleaned it up, early exit
+            /// If we hold the cleanup lock, enforce the task timeout and recover uncommitted exports.
+            /// Entries are never removed here, so we always fall through to refresh / addTask below.
             if (cleanup_lock)
             {
-                bool cleanup_successful = tryCleanup(
+                tryCleanup(
                     zk,
                     entry_path,
                     storage.log.load(),
@@ -465,9 +459,6 @@ void ExportPartitionManifestUpdatingTask::poll()
                     now,
                     *status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING,
                     deferred_commits);
-
-                if (cleanup_successful)
-                    continue;
             }
 
             if (has_local_entry_and_is_up_to_date)
diff --git a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
index 89ceeca47003..62a8a2196313 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_iceberg/test.py
@@ -746,51 +746,6 @@ def test_partition_key_compatibility_check(cluster):
     )
 
 
-def test_export_entries_never_expire(cluster):
-    """
-    Export entries never expire: a second export of the same partition is rejected
-    indefinitely unless export_merge_tree_partition_force_export is set.
-    """
-    node = cluster.instances["replica1"]
-
-    uid = unique_suffix()
-    mt_table = f"mt_{uid}"
-    iceberg_table = f"iceberg_{uid}"
-
-    setup_tables(cluster, mt_table, iceberg_table, nodes=["replica1"])
-
-    # First export.
-    node.query(
-        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table} "
-        f"SETTINGS allow_insert_into_iceberg = 1"
-    )
-    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
-    count_after_first = int(node.query(f"SELECT count() FROM {iceberg_table} WHERE year = 2020").strip())
-    assert count_after_first == 3, f"Expected 3 rows after first export, got {count_after_first}"
-
-    # A second export is rejected, and stays rejected (the entry never expires).
-    error = node.query_and_get_error(
-        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
-        settings={"allow_insert_into_iceberg": 1},
-    )
-    assert "Export with key" in error, f"Expected duplicate-export error, got: {error}"
-
-    time.sleep(5)
-    error = node.query_and_get_error(
-        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
-        settings={"allow_insert_into_iceberg": 1},
-    )
-    assert "Export with key" in error, f"Entry must not expire, got: {error}"
-
-    # Only force_export can overwrite the existing entry.
-    node.query(
-        f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {iceberg_table}",
-        settings={"allow_insert_into_iceberg": 1, "export_merge_tree_partition_force_export": 1},
-    )
-    wait_for_export_status(node, mt_table, iceberg_table, "2020", "COMPLETED")
-
-
 def test_export_data_files_are_not_cleaned_up_on_commit_failure(cluster):
     """
     Verify that the data files are not cleaned up on commit failure and the export is retried.
diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
index c4e38cbff276..8ad265a375ad 100644
--- a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
+++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py
@@ -627,45 +627,6 @@ def test_inject_short_living_failures(cluster):
     assert int(exception_count.strip()) >= 1, "Expected at least one exception"
 
 
-def test_export_entries_never_expire(cluster):
-    """Export entries are kept as history; a duplicate export is rejected unless forced."""
-    node = cluster.instances["replica1"]
-
-    postfix = str(uuid.uuid4()).replace("-", "_")
-    mt_table = f"export_ttl_mt_table_{postfix}"
-    s3_table = f"export_ttl_s3_table_{postfix}"
-
-    create_tables_and_insert_data(node, mt_table, s3_table, "replica1")
-
-    # start export
-    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
-    wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
-
-    assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed"
-
-    # the entry never expires, so re-exporting the same partition is always rejected
-    error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};")
-    assert "Export with key" in error, "Expected duplicate-export error"
-
-    # the completed entry is still present in the history table after a while
-    time.sleep(5)
-    assert node.query(
-        f"""
-        SELECT status FROM system.replicated_partition_exports
-        WHERE source_table = '{mt_table}'
-          AND destination_table = '{s3_table}'
-          AND partition_id = '2020'
-        """
-    ) == "COMPLETED\n", "Export entry should remain in history and not expire"
-
-    # only force_export can overwrite the existing entry
-    node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export = 1")
-    wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED")
-
-    # there should be two commit files now, one for each export
-    assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Forced re-export did not succeed"
-
-
 def test_export_partition_file_already_exists_policy(cluster):
     node = cluster.instances["replica1"]
 

From 16dac3cf25c8c6616945cacb215da10dc1b03111 Mon Sep 17 00:00:00 2001
From: Arthur Passos <arthur.ti@outlook.com>
Date: Tue, 16 Jun 2026 20:57:30 -0300
Subject: [PATCH 16/33] missing piece

---
 .../design/alter-table-export-part-partition.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/antalya/docs/design/alter-table-export-part-partition.md b/antalya/docs/design/alter-table-export-part-partition.md
index d513e4b80e41..8ebe237c634a 100644
--- a/antalya/docs/design/alter-table-export-part-partition.md
+++ b/antalya/docs/design/alter-table-export-part-partition.md
@@ -422,12 +422,13 @@ The following notes expand on expected behavior of commands.
    every active part of partition `p` across all replicas that host
    it; `system.replicated_partition_exports` converges to `COMPLETED`.
 
-4. Re-issuing the same `EXPORT PARTITION` within
-   `export_merge_tree_partition_manifest_ttl` is a no-op (no
-   duplicate files) unless `export_merge_tree_partition_force_export = 1`. This
-   behavior avoids accidentally exporting the same data twice. Note, however
-   that forcing the operation is dangerous if ClickHouse can't clean up the
-   previous operation. In this case you'll potentially commit files twice. 
+4. Re-issuing the same `EXPORT PARTITION` is rejected (no duplicate
+   files) unless `export_merge_tree_partition_force_export = 1`. Export
+   entries are kept indefinitely in `system.replicated_partition_exports`
+   as history; they never expire. This behavior avoids accidentally
+   exporting the same data twice. Note, however that forcing the
+   operation is dangerous if ClickHouse can't clean up the previous
+   operation. In this case you'll potentially commit files twice. 
 
 5. Killing an in-flight partition export via `KILL EXPORT PARTITION`
    transitions status to `KILLED` and stops all replicas' contributions.
@@ -481,7 +482,6 @@ The following notes expand on expected behavior of commands.
 | `export_merge_tree_part_filename_pattern` | query | `{part_name}_{checksum}` | `String` | both | Filename template; supports `{part_name}`, `{checksum}`, `{database}`, `{table}`, server macros. |
 | `export_merge_tree_partition_force_export` | query | `false` | `Bool` | `EXPORT PARTITION` | Overwrite a live Keeper manifest for the same `(source, destination, partition_id)`. Dangerous — can produce duplicate data on the destination; use with caution. |
 | `export_merge_tree_partition_max_retries` | query | `3` | `UInt64` | `EXPORT PARTITION` | Retry budget applied to both per-part export attempts and per-task commit attempts (Iceberg). The task fails terminally if commit retries alone exceed the budget. |
-| `export_merge_tree_partition_manifest_ttl` | query | `180` (seconds) | `UInt64` | `EXPORT PARTITION` | Live-manifest TTL; acts as the idempotency window. Does not interrupt in-flight tasks. Keep this greater than `export_merge_tree_partition_task_timeout_seconds` if you want the `KILLED` entry to remain visible in `system.replicated_partition_exports` after the timeout fires. |
 | `export_merge_tree_partition_task_timeout_seconds` | query | `3600` (seconds) | `UInt64` (`0`=disable) | `EXPORT PARTITION` | Wall-clock cap for `PENDING` tasks; on expiry transitions to `KILLED` with a timeout reason. Measured from manifest `create_time`. Enforcement latency ≈ one manifest-updater poll cycle (~30s) plus Keeper watch propagation. |
 | `export_merge_tree_partition_system_table_prefer_remote_information` | query | `false` | `Bool` | `EXPORT PARTITION` | When `true`, `system.replicated_partition_exports` fetches fresh state from Keeper (requires the `MULTI_READ` feature flag); when `false`, uses local cached state. **Default flipped from `true` to `false` in this release** — Keeper round-trips were more expensive than warranted for the typical observability workload. (See NOTE 2.)|
 | `export_merge_tree_part_file_already_exists_policy` | query | `skip` | `skip` / `error` / `overwrite` | `EXPORT PARTITION` | Per-file policy during partition export. |
@@ -722,7 +722,8 @@ peak memory. Hot path is the Parquet encoder, which warrants a guard against reg
   (`parts_to_do > 0`) rather than corrupt data. Acceptable but must be documented in the
   upgrade notes.
 - **Risk (object-storage cost / accidental large exports):** mitigated by the experimental
-  gates (default off) and the manifest idempotency window.
+  gates (default off) and the duplicate-export rejection (an existing export key is refused
+  unless `export_merge_tree_partition_force_export` is set).
 - **Risk (Iceberg catalog manifest retention):** if the catalog reaps old manifest files
   with a retention window shorter than `export_merge_tree_partition_task_timeout_seconds`,
   the rare "sole-node commits, crashes, recovers after reaper deleted the commit manifest"

From 978f9e172f5b9df86e529b017f6c5bff6802087d Mon Sep 17 00:00:00 2001
From: CarlosFelipeOR <carlosfelipeor@gmail.com>
Date: Thu, 18 Jun 2026 17:39:19 -0300
Subject: [PATCH 17/33] Skip store_data.py pre-hook in MasterCI to avoid Finish
 Workflow template-size overflow

Signed-off-by: CarlosFelipeOR <carlosfelipeor@gmail.com>
---
 ci/workflows/master.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/workflows/master.py b/ci/workflows/master.py
index cb67d223d13c..e6e9bbdeac88 100644
--- a/ci/workflows/master.py
+++ b/ci/workflows/master.py
@@ -101,7 +101,7 @@
     enable_commit_status_on_failure=True,
     enable_slack_feed=False,
     pre_hooks=[
-        "python3 ./ci/jobs/scripts/workflow_hooks/store_data.py",
+        # "python3 ./ci/jobs/scripts/workflow_hooks/store_data.py", # NOTE (carlosfelipeor): we don't use this in master CI
         "python3 ./ci/jobs/scripts/workflow_hooks/version_log.py",
         "python3 ./ci/jobs/scripts/workflow_hooks/parse_ci_tags.py",
         # "python3 ./ci/jobs/scripts/workflow_hooks/merge_sync_pr.py", # NOTE (strtgbb): we don't do this

From 03227749150845333ad362aca0961a5bf08c4b5b Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Fri, 27 Mar 2026 13:41:38 -0400
Subject: [PATCH 18/33] Add 'PRs in Release' table to report

---
 .../ci_run_report.html.jinja                  |   9 +
 .../create_workflow_report.py                 | 311 +++++++++++++++---
 2 files changed, 279 insertions(+), 41 deletions(-)

diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja
index 4c94465a16c6..a6e0df17c49f 100644
--- a/.github/actions/create_workflow_report/ci_run_report.html.jinja
+++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja
@@ -168,6 +168,7 @@
   {% endif %}
   <h2>Table of Contents</h2>
   <ul>
+    {%- if pr_number == 0 %}<li><a href="#prs-in-release">PRs in Release</a> ({{ counts.prs_in_release }})</li>{% endif %}
     {%- if pr_number != 0 %}<li><a href="#new-fails-pr">New Fails in PR</a> ({{ counts.pr_new_fails }})</li>{% endif %}
     <li><a href="#ci-jobs-status">CI Jobs Status</a> ({{ counts.jobs_status }})</li>
     <li><a href="#checks-errors">Checks Errors</a> ({{ counts.checks_errors }})</li>
@@ -177,6 +178,14 @@
     <li><a href="#checks-known-fails">Checks Known Fails</a> ({{ counts.checks_known_fails }})</li>
   </ul>
 
+  {%- if pr_number == 0 -%}
+  <h2 id="prs-in-release">PRs in Release</h2>
+  {% if prs_in_release_missing_verification %}
+  <p style="font-weight: bold;color: red;">Some PRs are missing verification.</p>
+  {% endif %}
+  {{ prs_in_release_html }}
+  {%- endif %}
+
   {%- if pr_number != 0 -%}
   <h2 id="new-fails-pr">New Fails in PR</h2>
   <p> Compared with base sha {{ base_sha }} </p>
diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index 8daf0fc24f31..a4f43e172459 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -6,10 +6,12 @@
 from itertools import combinations
 import json
 from datetime import datetime
+from datetime import timezone
 from functools import lru_cache
 from glob import glob
 import urllib.parse
 import re
+import subprocess
 
 import pandas as pd
 from jinja2 import Environment, FileSystemLoader
@@ -169,6 +171,196 @@ def get_run_details(run_id: str) -> dict:
     return response.json()
 
 
+def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.DataFrame, bool]:
+    if len(df) == 0:
+        return pd.DataFrame(columns=["pr_number", "pr_name", "labels"]), False
+    if not GITHUB_TOKEN:
+        raise Exception("GITHUB_TOKEN is required to fetch PR titles and labels")
+    headers = {
+        "Authorization": f"token {GITHUB_TOKEN}",
+        "Accept": "application/vnd.github.v3+json",
+    }
+    rows = []
+    missing_verification = False
+    for pr_number in df["pr_number"].tolist():
+        response = requests.get(
+            f"https://api.github.com/repos/{repo}/pulls/{pr_number}",
+            headers=headers,
+        )
+        if response.status_code != 200:
+            raise Exception(
+                f"Failed to fetch pull request info: {response.status_code} {response.text}"
+            )
+        pr = response.json()
+        label_names = [l["name"] for l in pr.get("labels", [])]
+        if any(ln.lower() == "cicd" for ln in label_names):
+            continue
+        lowered = {ln.lower() for ln in label_names}
+        if "verified" not in lowered and "verified-with-issue" not in lowered:
+            missing_verification = True
+        rows.append(
+            {
+                "pr_number": pr_number,
+                "pr_name": pr.get("title", ""),
+                "labels": ", ".join(sorted(label_names)),
+            }
+        )
+    return pd.DataFrame(rows), missing_verification
+
+
+def _git_rev_parse(ref: str, cwd: str | None) -> str | None:
+    p = subprocess.run(
+        ["git", "rev-parse", "--verify", ref],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+    )
+    if p.returncode != 0:
+        return None
+    return p.stdout.strip()
+
+
+def _git_is_ancestor(ancestor: str, descendant: str, cwd: str | None) -> bool:
+    p = subprocess.run(
+        ["git", "merge-base", "--is-ancestor", ancestor, descendant],
+        cwd=cwd,
+        capture_output=True,
+    )
+    return p.returncode == 0
+
+
+def _git_log_merge_prs(
+    baseline: str, branch_ref: str, cwd: str | None, repo: str
+) -> pd.DataFrame:
+    p = subprocess.run(
+        [
+            "git",
+            "-c",
+            "core.quotepath=false",
+            "log",
+            f"{baseline}..{branch_ref}",
+            "--merges",
+            "--format=%H%x09%s",
+        ],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    rows = []
+    for line in p.stdout.splitlines():
+        if not line.strip():
+            continue
+        sha, subject = line.split("\t", 1)
+        m = re.match(
+            r"Merge pull request #(\d+) from ([^/\s]+)/", subject, re.IGNORECASE
+        )
+        if not m:
+            continue
+        pr_number, head_owner = int(m.group(1)), m.group(2)
+        if head_owner.lower() != repo.split("/")[0].lower():
+            continue
+        rows.append(
+            {
+                "pr_number": pr_number,
+                "merge_commit_sha": sha,
+                "merge_subject": subject,
+            }
+        )
+    if not rows:
+        return pd.DataFrame(columns=["pr_number", "merge_commit_sha", "merge_subject"])
+    df = pd.DataFrame(rows)
+    df = df.drop_duplicates(subset=["pr_number"], keep="first")
+    return df
+
+
+def _find_release_baseline(
+    branch_ref: str, repo: str, cwd: str | None
+) -> tuple[str | None, str | None]:
+    if not GITHUB_TOKEN:
+        return None, None
+    headers = {
+        "Authorization": f"token {GITHUB_TOKEN}",
+        "Accept": "application/vnd.github.v3+json",
+    }
+    response = requests.get(
+        f"https://api.github.com/repos/{repo}/releases?per_page=100",
+        headers=headers,
+    )
+    if response.status_code != 200:
+        raise Exception(
+            f"GitHub API request failed: {response.status_code} {response.text}"
+        )
+    for rel in response.json():
+        if rel.get("draft"):
+            continue
+        tag_name = rel.get("tag_name")
+        if not tag_name:
+            continue
+        tag_sha = _git_rev_parse(tag_name, cwd)
+        if not tag_sha:
+            continue
+        if not _git_is_ancestor(tag_sha, branch_ref, cwd):
+            continue
+        return tag_name, tag_sha
+    return None, None
+
+
+def _find_rebase_baseline(branch_ref: str, cwd: str | None) -> str | None:
+    p = subprocess.run(
+        [
+            "git",
+            "log",
+            branch_ref,
+            "--reverse",
+            "--merges",
+            "-i",
+            "--grep=rebase-cicd",
+            "--grep=rebase/",
+            "--format=%H",
+        ],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+    )
+    if p.returncode != 0:
+        return None
+    lines = [ln for ln in p.stdout.splitlines() if ln.strip()]
+    if not lines:
+        return None
+    return lines[0]
+
+
+def get_prs_in_release_dataframe(
+    branch_ref: str = "HEAD",
+    *,
+    repo: str = GITHUB_REPO,
+    cwd: str | None = None,
+) -> tuple[pd.DataFrame, bool]:
+    """
+    PRs merged into branch_ref that belong in the next release notes: after the latest GitHub
+    Release tag on this history, or after the oldest rebase bootstrap if no such tag exists.
+    Only merge commits whose subject has from <repo_owner>/ (e.g. from Altinity/) are included.
+    Columns: pr_number, pr_name, labels. Omits PRs labeled cicd.
+    Second value is True if any listed PR lacks verified or verified-with-issue.
+    """
+    branch_sha = _git_rev_parse(branch_ref, cwd)
+    if not branch_sha:
+        raise Exception(f"Cannot resolve branch ref: {branch_ref!r}")
+
+    baseline_ref, baseline_sha = _find_release_baseline(branch_ref, repo, cwd)
+    if not baseline_sha:
+        rebase_sha = _find_rebase_baseline(branch_ref, cwd)
+        if not rebase_sha:
+            raise Exception(
+                "No release tag on this branch and no rebase bootstrap merge found"
+            )
+        baseline_sha = rebase_sha
+
+    df = _git_log_merge_prs(baseline_sha, branch_ref, cwd, repo)
+    return _enrich_prs_in_release_merge_prs(df, repo)
+
+
 def _checks_latest_test_status_cte(commit_sha: str, branch_name: str) -> str:
     """
     Shared filtering for gh-data.checks: anchor time excludes stateless teardown checks
@@ -622,25 +814,40 @@ def format_test_status(text: str) -> str:
 
 
 def format_results_as_html_table(results) -> str:
+    if not isinstance(results, pd.DataFrame):
+        return results
+
     if len(results) == 0:
         return "<p>Nothing to report</p>"
-    results.columns = [col.replace("_", " ").title() for col in results.columns]
+
+    results = results.copy()
+
+    def format_col_name(col_name: str) -> str:
+        return col_name.replace("_", " ").title().replace("Pr ", "PR ")
+
+    results.columns = [format_col_name(col) for col in results.columns]
+
+    formatters = {
+        "Results Link": url_to_html_link,
+        "Test Name": format_test_name_for_linewrap,
+        "Test Status": format_test_status,
+        "Job Status": format_test_status,
+        "Status": format_test_status,
+        "Message": lambda m: m.replace("\n", " "),
+        "Identifier": lambda i: url_to_html_link(
+            "https://nvd.nist.gov/vuln/detail/" + i
+        ),
+        "Severity": lambda s: (
+            f'<span data-sort="{CVE_SEVERITY_ORDER.get(str(s).lower(), 6)}">{s}</span>'
+        ),
+        "PR Number": lambda n: url_to_html_link(
+            f"https://github.com/{GITHUB_REPO}/pull/{n}"
+        ),
+    }
+
     html = results.to_html(
         index=False,
-        formatters={
-            "Results Link": url_to_html_link,
-            "Test Name": format_test_name_for_linewrap,
-            "Test Status": format_test_status,
-            "Job Status": format_test_status,
-            "Status": format_test_status,
-            "Message": lambda m: m.replace("\n", " "),
-            "Identifier": lambda i: url_to_html_link(
-                "https://nvd.nist.gov/vuln/detail/" + i
-            ),
-            "Severity": lambda s: (
-                f'<span data-sort="{CVE_SEVERITY_ORDER.get(str(s).lower(), 6)}">{s}</span>'
-            ),
-        },
+        formatters=formatters,
         escape=False,
         border=0,
         classes=["test-results-table"],
@@ -816,7 +1023,9 @@ def create_workflow_report(
         settings={"use_numpy": True},
     )
 
-    fail_results = {
+    prs_in_release_missing_verification = False
+    results_dfs = {
+        "prs_in_release": [],
         "job_statuses": get_commit_statuses(commit_sha),
         "checks_fails": get_checks_fails(db_client, commit_sha, branch_name),
         "checks_known_fails": [],
@@ -826,8 +1035,17 @@ def create_workflow_report(
         "docker_images_cves": [],
     }
 
+    if pr_number == 0 and not mark_preview:
+        try:
+            prs_df, prs_in_release_missing_verification = get_prs_in_release_dataframe(
+                branch_name, cwd=os.getcwd()
+            )
+            results_dfs["prs_in_release"] = prs_df
+        except Exception as e:
+            print(f"Error in get_prs_in_release_dataframe: {e}")
+
     try:
-        fail_results["docker_images_cves"] = (
+        results_dfs["docker_images_cves"] = (
             [] if not check_cves else get_cves(pr_number, commit_sha, branch_name)
         )
     except Exception as e:
@@ -835,7 +1053,7 @@ def create_workflow_report(
 
     # get_cves returns ... in the case where no Grype result files were found.
     # This might occur when run in preview mode.
-    cves_not_checked = not check_cves or fail_results["docker_images_cves"] is ...
+    cves_not_checked = not check_cves or results_dfs["docker_images_cves"] is ...
 
     if known_fails_file_path:
         if not os.path.exists(known_fails_file_path):
@@ -843,7 +1061,7 @@ def create_workflow_report(
         else:
             known_fails = get_broken_tests_rules(known_fails_file_path)
 
-            fail_results["checks_known_fails"] = get_checks_known_fails(
+            results_dfs["checks_known_fails"] = get_checks_known_fails(
                 db_client, commit_sha, branch_name, known_fails
             )
 
@@ -855,24 +1073,24 @@ def create_workflow_report(
             pr_info_html = f"""<a href="https://github.com/{GITHUB_REPO}/pull/{pr_info["number"]}">
                     #{pr_info.get("number")} ({pr_info.get("base", {}).get('ref')} <- {pr_info.get("head", {}).get('ref')})  {pr_info.get("title")}
                     </a>"""
-            fail_results["pr_new_fails"] = get_new_fails_this_pr(
+            results_dfs["pr_new_fails"] = get_new_fails_this_pr(
                 db_client,
                 pr_info,
-                fail_results["checks_fails"],
-                fail_results["regression_fails"],
+                results_dfs["checks_fails"],
+                results_dfs["regression_fails"],
             )
         except Exception as e:
             pr_info_html = e
             pr_info = {}
 
-    fail_results["job_statuses"] = backfill_skipped_statuses(
-        fail_results["job_statuses"], pr_number, branch_name, commit_sha
+    results_dfs["job_statuses"] = backfill_skipped_statuses(
+        results_dfs["job_statuses"], pr_number, branch_name, commit_sha
     )
 
     high_cve_count = 0
-    if not cves_not_checked and len(fail_results["docker_images_cves"]) > 0:
+    if not cves_not_checked and len(results_dfs["docker_images_cves"]) > 0:
         high_cve_count = (
-            fail_results["docker_images_cves"]["severity"]
+            results_dfs["docker_images_cves"]["severity"]
             .str.lower()
             .isin(("high", "critical"))
             .sum()
@@ -893,43 +1111,54 @@ def create_workflow_report(
         "workflow_id": run_id,
         "commit_sha": commit_sha,
         "base_sha": "" if pr_number == 0 else pr_info.get("base", {}).get("sha"),
-        "date": f"{datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC",
+        "date": f"{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC",
         "is_preview": mark_preview,
+        "prs_in_release_missing_verification": prs_in_release_missing_verification,
         "counts": {
-            "jobs_status": f"{sum(fail_results['job_statuses']['job_status'].value_counts().get(x, 0) for x in ('failure', 'error'))} fail/error",
-            "checks_errors": len(fail_results["checks_errors"]),
-            "checks_new_fails": len(fail_results["checks_fails"]),
-            "regression_new_fails": len(fail_results["regression_fails"]),
+            "jobs_status": f"{sum(results_dfs['job_statuses']['job_status'].value_counts().get(x, 0) for x in ('failure', 'error'))} fail/error",
+            "checks_errors": len(results_dfs["checks_errors"]),
+            "checks_new_fails": len(results_dfs["checks_fails"]),
+            "regression_new_fails": len(results_dfs["regression_fails"]),
             "cves": "N/A" if cves_not_checked else f"{high_cve_count} high/critical",
             "checks_known_fails": (
-                "N/A" if not known_fails else len(fail_results["checks_known_fails"])
+                "N/A" if not known_fails else len(results_dfs["checks_known_fails"])
+            ),
+            "pr_new_fails": len(results_dfs["pr_new_fails"]),
+            "prs_in_release": (
+                "N/A"
+                if mark_preview or pr_number != 0
+                else len(results_dfs["prs_in_release"])
             ),
-            "pr_new_fails": len(fail_results["pr_new_fails"]),
         },
         "build_report_links": get_build_report_links(
-            fail_results["job_statuses"], pr_number, branch_name, commit_sha
+            results_dfs["job_statuses"], pr_number, branch_name, commit_sha
+        ),
+        "prs_in_release_html": (
+            "<p>PR details are not loaded during preview.</p>"
+            if mark_preview or pr_number != 0
+            else format_results_as_html_table(results_dfs["prs_in_release"])
         ),
         "ci_jobs_status_html": format_results_as_html_table(
-            fail_results["job_statuses"]
+            results_dfs["job_statuses"]
         ),
         "checks_errors_html": format_results_as_html_table(
-            fail_results["checks_errors"]
+            results_dfs["checks_errors"]
         ),
-        "checks_fails_html": format_results_as_html_table(fail_results["checks_fails"]),
+        "checks_fails_html": format_results_as_html_table(results_dfs["checks_fails"]),
         "regression_fails_html": format_results_as_html_table(
-            fail_results["regression_fails"]
+            results_dfs["regression_fails"]
         ),
         "docker_images_cves_html": (
             "<p>Not Checked</p>"
             if cves_not_checked
-            else format_results_as_html_table(fail_results["docker_images_cves"])
+            else format_results_as_html_table(results_dfs["docker_images_cves"])
         ),
         "checks_known_fails_html": (
             "<p>Not Checked</p>"
             if not known_fails
-            else format_results_as_html_table(fail_results["checks_known_fails"])
+            else format_results_as_html_table(results_dfs["checks_known_fails"])
         ),
-        "new_fails_html": format_results_as_html_table(fail_results["pr_new_fails"]),
+        "new_fails_html": format_results_as_html_table(results_dfs["pr_new_fails"]),
     }
 
     # Render the template with the context

From c175440d692047c4fbcedac35d1c4462321570c2 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Fri, 27 Mar 2026 16:37:14 -0400
Subject: [PATCH 19/33] highlight labels for unverified PRs

---
 .../create_workflow_report.py                 | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index a4f43e172459..505872d6260b 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -32,6 +32,8 @@
 
 CVE_SEVERITY_ORDER = {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5}
 
+PR_LABELS_VERIFIED = {"verified", "verified-with-issue", "no-verification-needed"}
+
 def _is_clickhouse_memory_limit_error(exc: BaseException) -> bool:
     if isinstance(exc, ServerException) and getattr(exc, "code", None) == 241:
         return True
@@ -173,7 +175,7 @@ def get_run_details(run_id: str) -> dict:
 
 def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.DataFrame, bool]:
     if len(df) == 0:
-        return pd.DataFrame(columns=["pr_number", "pr_name", "labels"]), False
+        return pd.DataFrame(columns=["pr_number", "pr_name", "pr_labels"]), False
     if not GITHUB_TOKEN:
         raise Exception("GITHUB_TOKEN is required to fetch PR titles and labels")
     headers = {
@@ -193,16 +195,15 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da
             )
         pr = response.json()
         label_names = [l["name"] for l in pr.get("labels", [])]
-        if any(ln.lower() == "cicd" for ln in label_names):
+        if "cicd" in label_names:
             continue
-        lowered = {ln.lower() for ln in label_names}
-        if "verified" not in lowered and "verified-with-issue" not in lowered:
+        if not PR_LABELS_VERIFIED.intersection(label_names):
             missing_verification = True
         rows.append(
             {
                 "pr_number": pr_number,
                 "pr_name": pr.get("title", ""),
-                "labels": ", ".join(sorted(label_names)),
+                "pr_labels": ", ".join(sorted(label_names)),
             }
         )
     return pd.DataFrame(rows), missing_verification
@@ -337,12 +338,12 @@ def get_prs_in_release_dataframe(
     repo: str = GITHUB_REPO,
     cwd: str | None = None,
 ) -> tuple[pd.DataFrame, bool]:
-    """
+    f"""
     PRs merged into branch_ref that belong in the next release notes: after the latest GitHub
     Release tag on this history, or after the oldest rebase bootstrap if no such tag exists.
     Only merge commits whose subject has from <repo_owner>/ (e.g. from Altinity/) are included.
     Columns: pr_number, pr_name, labels. Omits PRs labeled cicd.
-    Second value is True if any listed PR lacks verified or verified-with-issue.
+    Second value is True if any listed PR lacks verified labels.
     """
     branch_sha = _git_rev_parse(branch_ref, cwd)
     if not branch_sha:
@@ -794,6 +795,15 @@ def url_to_html_link(url: str) -> str:
     return f'<a href="{url}">{text}</a>'
 
 
+def format_pr_labels_with_verification(labels: str) -> str:
+    """Format the PR labels with verification."""
+    labels_list = labels.split(", ")
+    if PR_LABELS_VERIFIED.intersection(labels_list):
+        return labels
+    else:
+        return f'<span style="font-weight: bold; color: red;">{labels} (missing verification)</span>'
+
+
 def format_test_name_for_linewrap(text: str) -> str:
     """Tweak the test name to improve line wrapping."""
     return f'<span style="line-break: anywhere;">{text}</span>'
@@ -843,6 +853,7 @@ def format_col_name(col_name: str) -> str:
         "PR Number": lambda n: url_to_html_link(
             f"https://github.com/{GITHUB_REPO}/pull/{n}"
         ),
+        "PR Labels": format_pr_labels_with_verification,
     }
 
     html = results.to_html(

From a7113d0e0a3345ee8ebc3ae683a2407c003c9b42 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Tue, 31 Mar 2026 11:43:21 -0400
Subject: [PATCH 20/33] report: fix _find_rebase_baseline

---
 .../create_workflow_report/create_workflow_report.py       | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index 505872d6260b..bade4193aaac 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -314,10 +314,11 @@ def _find_rebase_baseline(branch_ref: str, cwd: str | None) -> str | None:
             "log",
             branch_ref,
             "--reverse",
-            "--merges",
             "-i",
-            "--grep=rebase-cicd",
-            "--grep=rebase/",
+            "-E",
+            "--grep=^Rebase CICD",
+            "--grep=Merge pull request .*rebase-cicd",
+            "--grep=Merge pull request .*from Altinity/rebase/",
             "--format=%H",
         ],
         cwd=cwd,

From 458420e92b659310166826514d161ab1311e610f Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Tue, 31 Mar 2026 20:47:20 -0400
Subject: [PATCH 21/33] ensure extra git history is fetched in
 get_prs_in_release_dataframe

---
 .../create_workflow_report.py                 | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index bade4193aaac..778452ef0a6a 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 import os
+import sys
 import time
 from pathlib import Path
 from itertools import combinations
@@ -337,7 +338,7 @@ def get_prs_in_release_dataframe(
     branch_ref: str = "HEAD",
     *,
     repo: str = GITHUB_REPO,
-    cwd: str | None = None,
+    cwd: str,
 ) -> tuple[pd.DataFrame, bool]:
     f"""
     PRs merged into branch_ref that belong in the next release notes: after the latest GitHub
@@ -350,8 +351,25 @@ def get_prs_in_release_dataframe(
     if not branch_sha:
         raise Exception(f"Cannot resolve branch ref: {branch_ref!r}")
 
+    # CI often checks out with --depth=1 and --no-tags; fetch enough history and tags once.
+    subprocess.run(
+        ["git", "fetch", "--tags", "origin"],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
     baseline_ref, baseline_sha = _find_release_baseline(branch_ref, repo, cwd)
     if not baseline_sha:
+        # If no release tag, try to find rebase commit 200 commits back.
+        subprocess.run(
+            ["git", "fetch", "--deepen=200", "origin", branch_ref],
+            cwd=cwd,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
         rebase_sha = _find_rebase_baseline(branch_ref, cwd)
         if not rebase_sha:
             raise Exception(

From 27215e627360393b9198cab5496ea8f117d1a1c3 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Wed, 1 Apr 2026 09:53:34 -0400
Subject: [PATCH 22/33] no longer consider labels no-verification-needed and
 cicd

---
 .../actions/create_workflow_report/create_workflow_report.py  | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index 778452ef0a6a..1eddb30b83eb 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -33,7 +33,7 @@
 
 CVE_SEVERITY_ORDER = {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5}
 
-PR_LABELS_VERIFIED = {"verified", "verified-with-issue", "no-verification-needed"}
+PR_LABELS_VERIFIED = {"verified", "verified-with-issue"}
 
 def _is_clickhouse_memory_limit_error(exc: BaseException) -> bool:
     if isinstance(exc, ServerException) and getattr(exc, "code", None) == 241:
@@ -196,8 +196,6 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da
             )
         pr = response.json()
         label_names = [l["name"] for l in pr.get("labels", [])]
-        if "cicd" in label_names:
-            continue
         if not PR_LABELS_VERIFIED.intersection(label_names):
             missing_verification = True
         rows.append(

From 7618b0b5ffdd204743da4257dfcf80f423e31157 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:11:57 -0400
Subject: [PATCH 23/33] simplify verified check

---
 .../ci_run_report.html.jinja                  |  3 ---
 .../create_workflow_report.py                 | 21 ++++++-------------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja
index a6e0df17c49f..eba0a862561f 100644
--- a/.github/actions/create_workflow_report/ci_run_report.html.jinja
+++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja
@@ -180,9 +180,6 @@
 
   {%- if pr_number == 0 -%}
   <h2 id="prs-in-release">PRs in Release</h2>
-  {% if prs_in_release_missing_verification %}
-  <p style="font-weight: bold;color: red;">Some PRs are missing verification.</p>
-  {% endif %}
   {{ prs_in_release_html }}
   {%- endif %}
 
diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index 1eddb30b83eb..8e03badf464c 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -33,7 +33,6 @@
 
 CVE_SEVERITY_ORDER = {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5}
 
-PR_LABELS_VERIFIED = {"verified", "verified-with-issue"}
 
 def _is_clickhouse_memory_limit_error(exc: BaseException) -> bool:
     if isinstance(exc, ServerException) and getattr(exc, "code", None) == 241:
@@ -174,9 +173,9 @@ def get_run_details(run_id: str) -> dict:
     return response.json()
 
 
-def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.DataFrame, bool]:
+def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> pd.DataFrame:
     if len(df) == 0:
-        return pd.DataFrame(columns=["pr_number", "pr_name", "pr_labels"]), False
+        return pd.DataFrame(columns=["pr_number", "pr_name", "pr_labels"])
     if not GITHUB_TOKEN:
         raise Exception("GITHUB_TOKEN is required to fetch PR titles and labels")
     headers = {
@@ -184,7 +183,6 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da
         "Accept": "application/vnd.github.v3+json",
     }
     rows = []
-    missing_verification = False
     for pr_number in df["pr_number"].tolist():
         response = requests.get(
             f"https://api.github.com/repos/{repo}/pulls/{pr_number}",
@@ -196,8 +194,6 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da
             )
         pr = response.json()
         label_names = [l["name"] for l in pr.get("labels", [])]
-        if not PR_LABELS_VERIFIED.intersection(label_names):
-            missing_verification = True
         rows.append(
             {
                 "pr_number": pr_number,
@@ -205,7 +201,7 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> tuple[pd.Da
                 "pr_labels": ", ".join(sorted(label_names)),
             }
         )
-    return pd.DataFrame(rows), missing_verification
+    return pd.DataFrame(rows)
 
 
 def _git_rev_parse(ref: str, cwd: str | None) -> str | None:
@@ -337,13 +333,12 @@ def get_prs_in_release_dataframe(
     *,
     repo: str = GITHUB_REPO,
     cwd: str,
-) -> tuple[pd.DataFrame, bool]:
+) -> pd.DataFrame:
     f"""
     PRs merged into branch_ref that belong in the next release notes: after the latest GitHub
     Release tag on this history, or after the oldest rebase bootstrap if no such tag exists.
     Only merge commits whose subject has from <repo_owner>/ (e.g. from Altinity/) are included.
     Columns: pr_number, pr_name, labels. Omits PRs labeled cicd.
-    Second value is True if any listed PR lacks verified labels.
     """
     branch_sha = _git_rev_parse(branch_ref, cwd)
     if not branch_sha:
@@ -815,7 +810,7 @@ def url_to_html_link(url: str) -> str:
 def format_pr_labels_with_verification(labels: str) -> str:
     """Format the PR labels with verification."""
     labels_list = labels.split(", ")
-    if PR_LABELS_VERIFIED.intersection(labels_list):
+    if "verified" in labels_list:
         return labels
     else:
         return f'<span style="font-weight: bold; color: red;">{labels} (missing verification)</span>'
@@ -1051,7 +1046,6 @@ def create_workflow_report(
         settings={"use_numpy": True},
     )
 
-    prs_in_release_missing_verification = False
     results_dfs = {
         "prs_in_release": [],
         "job_statuses": get_commit_statuses(commit_sha),
@@ -1065,9 +1059,7 @@ def create_workflow_report(
 
     if pr_number == 0 and not mark_preview:
         try:
-            prs_df, prs_in_release_missing_verification = get_prs_in_release_dataframe(
-                branch_name, cwd=os.getcwd()
-            )
+            prs_df = get_prs_in_release_dataframe(branch_name, cwd=os.getcwd())
             results_dfs["prs_in_release"] = prs_df
         except Exception as e:
             print(f"Error in get_prs_in_release_dataframe: {e}")
@@ -1141,7 +1133,6 @@ def create_workflow_report(
         "base_sha": "" if pr_number == 0 else pr_info.get("base", {}).get("sha"),
         "date": f"{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC",
         "is_preview": mark_preview,
-        "prs_in_release_missing_verification": prs_in_release_missing_verification,
         "counts": {
             "jobs_status": f"{sum(results_dfs['job_statuses']['job_status'].value_counts().get(x, 0) for x in ('failure', 'error'))} fail/error",
             "checks_errors": len(results_dfs["checks_errors"]),

From 8d7c262f11db1e1038c5db9a0d1df848786772f7 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Mon, 18 May 2026 14:14:14 -0400
Subject: [PATCH 24/33] swap columns

---
 .../actions/create_workflow_report/create_workflow_report.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index 8e03badf464c..12adda5c8608 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -196,8 +196,8 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> pd.DataFram
         label_names = [l["name"] for l in pr.get("labels", [])]
         rows.append(
             {
-                "pr_number": pr_number,
                 "pr_name": pr.get("title", ""),
+                "pr_number": pr_number,
                 "pr_labels": ", ".join(sorted(label_names)),
             }
         )

From 6e748a713cbd50eb756d9482813a1e38abac4524 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:14:18 -0400
Subject: [PATCH 25/33] add tabbed navigation to report

---
 .../ci_run_report.html.jinja                  | 169 +++++++++++++-----
 1 file changed, 128 insertions(+), 41 deletions(-)

diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja
index eba0a862561f..86cf5cbc8ba2 100644
--- a/.github/actions/create_workflow_report/ci_run_report.html.jinja
+++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja
@@ -13,10 +13,11 @@
     :root {
       --altinity-background: #000D45;
       --altinity-accent: #189DCF;
-      --altinity-highlight: #FFC600;
+      --altinity-link-hover: #FFC600;
       --altinity-gray: #6c757d;
       --altinity-light-gray: #f8f9fa;
       --altinity-white: #ffffff;
+      --altinity-hover: color-mix(in srgb, var(--altinity-accent) 12%, var(--altinity-white));
     }
 
     /* Body and heading fonts */
@@ -45,7 +46,7 @@
 
     /* General table styling */
     table {
-      min-width: min(900px, 98vw);
+      min-width: min(900px, 100%);
       margin: 1rem 0;
       border-collapse: collapse;
       background-color: var(--altinity-white);
@@ -107,7 +108,7 @@
 
     /* Table body row styling */
     tr:hover {
-      background-color: var(--altinity-light-gray);
+      background-color: var(--altinity-hover);
     }
 
     /* Table cell styling */
@@ -123,9 +124,69 @@
     }
 
     a:hover {
-      color: var(--altinity-highlight);
+      color: var(--altinity-link-hover);
       text-decoration: underline;
     }
+
+    /* Tab navigation: wrapping pill strip on a white surface that matches the tables */
+    .tab {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 0.375rem;
+      padding: 0.375rem;
+      border-radius: 0.75rem;
+      width: fit-content;
+      max-width: 100%;
+      margin: 1.5rem 0;
+      background: var(--altinity-white);
+    }
+
+    .tab button.tablinks {
+      appearance: none;
+      border: 0;
+      background: transparent;
+      padding: 0.55rem 0.95rem;
+      border-radius: 0.5rem;
+      font: inherit;
+      cursor: pointer;
+      white-space: nowrap;
+      color: var(--altinity-background);
+      transition: background 0.12s, color 0.12s;
+    }
+
+    .tab button.tablinks:hover {
+      background: var(--altinity-hover);
+    }
+
+    .tab button.tablinks.active {
+      background: var(--altinity-accent);
+      color: var(--altinity-white);
+      font-weight: 600;
+    }
+
+    .tabcontent {
+      display: none;
+      padding: 0;
+      overflow-x: auto;
+      animation: fadeEffect 0.3s;
+    }
+
+    .tabcontent table {
+      min-width: 0;
+    }
+
+    .tabcontent > *:first-child {
+      margin-top: 0;
+    }
+
+    .tabcontent.active {
+      display: block;
+    }
+
+    @keyframes fadeEffect {
+      from { opacity: 0; }
+      to   { opacity: 1; }
+    }
   </style>
   <title>{{ title }}</title>
   <link rel="icon" type="image/svg+xml"
@@ -166,52 +227,78 @@
   {% if is_preview %}
   <p style="font-weight: bold;color: red;">This is a preview. The workflow is not yet finished.</p>
   {% endif %}
-  <h2>Table of Contents</h2>
-  <ul>
-    {%- if pr_number == 0 %}<li><a href="#prs-in-release">PRs in Release</a> ({{ counts.prs_in_release }})</li>{% endif %}
-    {%- if pr_number != 0 %}<li><a href="#new-fails-pr">New Fails in PR</a> ({{ counts.pr_new_fails }})</li>{% endif %}
-    <li><a href="#ci-jobs-status">CI Jobs Status</a> ({{ counts.jobs_status }})</li>
-    <li><a href="#checks-errors">Checks Errors</a> ({{ counts.checks_errors }})</li>
-    <li><a href="#checks-fails">Checks New Fails</a> ({{ counts.checks_new_fails }})</li>
-    <li><a href="#regression-fails">Regression New Fails</a> ({{ counts.regression_new_fails }})</li>
-    <li><a href="#docker-images-cves">Docker Images CVEs</a> ({{ counts.cves }})</li>
-    <li><a href="#checks-known-fails">Checks Known Fails</a> ({{ counts.checks_known_fails }})</li>
-  </ul>
-
-  {%- if pr_number == 0 -%}
-  <h2 id="prs-in-release">PRs in Release</h2>
-  {{ prs_in_release_html }}
+  <div class="tab">
+    {%- if pr_number == 0 %}<button class="tablinks" onclick="openTab(event, 'prs-in-release')">PRs in Release ({{ counts.prs_in_release }})</button>{% endif %}
+    {%- if pr_number != 0 %}<button class="tablinks" onclick="openTab(event, 'new-fails-pr')">New Fails in PR ({{ counts.pr_new_fails }})</button>{% endif %}
+    <button class="tablinks" onclick="openTab(event, 'ci-jobs-status')">CI Jobs Status ({{ counts.jobs_status }})</button>
+    <button class="tablinks" onclick="openTab(event, 'checks-errors')">Checks Errors ({{ counts.checks_errors }})</button>
+    <button class="tablinks" onclick="openTab(event, 'checks-fails')">Checks New Fails ({{ counts.checks_new_fails }})</button>
+    <button class="tablinks" onclick="openTab(event, 'regression-fails')">Regression New Fails ({{ counts.regression_new_fails }})</button>
+    <button class="tablinks" onclick="openTab(event, 'docker-images-cves')">Docker Images CVEs ({{ counts.cves }})</button>
+    <button class="tablinks" onclick="openTab(event, 'checks-known-fails')">Checks Known Fails ({{ counts.checks_known_fails }})</button>
+  </div>
+
+  {%- if pr_number == 0 %}
+  <div id="prs-in-release" class="tabcontent">
+    {{ prs_in_release_html }}
+  </div>
   {%- endif %}
 
-  {%- if pr_number != 0 -%}
-  <h2 id="new-fails-pr">New Fails in PR</h2>
-  <p> Compared with base sha {{ base_sha }} </p>
-  {{ new_fails_html }}
+  {%- if pr_number != 0 %}
+  <div id="new-fails-pr" class="tabcontent">
+    <p>Compared with base sha {{ base_sha }}</p>
+    {{ new_fails_html }}
+  </div>
   {%- endif %}
 
-  <h2 id="ci-jobs-status">CI Jobs Status</h2>
-  {{ ci_jobs_status_html }}
+  <div id="ci-jobs-status" class="tabcontent">
+    {{ ci_jobs_status_html }}
+  </div>
+
+  <div id="checks-errors" class="tabcontent">
+    {{ checks_errors_html }}
+  </div>
 
-  <h2 id="checks-errors">Checks Errors</h2>
-  {{ checks_errors_html }}
+  <div id="checks-fails" class="tabcontent">
+    {{ checks_fails_html }}
+  </div>
 
-  <h2 id="checks-fails">Checks New Fails</h2>
-  {{ checks_fails_html }}
+  <div id="regression-fails" class="tabcontent">
+    {{ regression_fails_html }}
+  </div>
 
-  <h2 id="regression-fails">Regression New Fails</h2>
-  {{ regression_fails_html }}
+  <div id="docker-images-cves" class="tabcontent">
+    {{ docker_images_cves_html }}
+  </div>
 
-  <h2 id="docker-images-cves">Docker Images CVEs</h2>
-  {{ docker_images_cves_html }}
+  <div id="checks-known-fails" class="tabcontent">
+    <p>
+      Fail reason conventions:<br/>
+      KNOWN - Accepted fail and fix is not planned<br/>
+      INVESTIGATE - We don't know why it fails<br/>
+      NEEDSFIX - Investigation done and a fix is needed to make it pass<br/>
+    </p>
+    {{ checks_known_fails_html }}
+  </div>
 
-  <h2 id="checks-known-fails">Checks Known Fails</h2>
-  <p>
-    Fail reason conventions:<br/>
-    KNOWN - Accepted fail and fix is not planned<br/>
-    INVESTIGATE - We don't know why it fails<br/>
-    NEEDSFIX - Investigation done and a fix is needed to make it pass<br/>
-  </p>
-  {{ checks_known_fails_html }}
+  <script>
+    function openTab(evt, tabId) {
+      var i, tabcontent, tablinks;
+      tabcontent = document.getElementsByClassName('tabcontent');
+      for (i = 0; i < tabcontent.length; i++) {
+        tabcontent[i].classList.remove('active');
+      }
+      tablinks = document.getElementsByClassName('tablinks');
+      for (i = 0; i < tablinks.length; i++) {
+        tablinks[i].classList.remove('active');
+      }
+      document.getElementById(tabId).classList.add('active');
+      evt.currentTarget.classList.add('active');
+    }
+    // Open CI Jobs Status by default
+    var defaultTab = document.querySelector('.tab button[onclick*="ci-jobs-status"]');
+    if (defaultTab) defaultTab.click();
+  </script>
 
   <script>
     document.addEventListener('click', function (e) {

From a4f0276f66e54c31f2f040846b681f7108037e85 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Mon, 15 Jun 2026 14:06:16 -0400
Subject: [PATCH 26/33] support # in url with tabs, improve tab button
 visibility

---
 .../ci_run_report.html.jinja                      | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja
index 86cf5cbc8ba2..3d908e2484eb 100644
--- a/.github/actions/create_workflow_report/ci_run_report.html.jinja
+++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja
@@ -132,8 +132,8 @@
     .tab {
       display: flex;
       flex-wrap: wrap;
-      gap: 0.375rem;
-      padding: 0.375rem;
+      gap: 0.4rem;
+      padding: 0.4rem;
       border-radius: 0.75rem;
       width: fit-content;
       max-width: 100%;
@@ -143,7 +143,7 @@
 
     .tab button.tablinks {
       appearance: none;
-      border: 0;
+      border: 1px solid color-mix(in srgb, var(--altinity-accent) 60%, var(--altinity-white));
       background: transparent;
       padding: 0.55rem 0.95rem;
       border-radius: 0.5rem;
@@ -294,10 +294,13 @@
       }
       document.getElementById(tabId).classList.add('active');
       evt.currentTarget.classList.add('active');
+      if (evt.isTrusted) history.replaceState(null, '', '#' + tabId);
     }
-    // Open CI Jobs Status by default
-    var defaultTab = document.querySelector('.tab button[onclick*="ci-jobs-status"]');
-    if (defaultTab) defaultTab.click();
+    // Open the tab named in the URL hash, or fall back to CI Jobs Status
+    var hash = window.location.hash.slice(1);
+    var startTab = (hash && document.querySelector('.tab button[onclick*="' + hash + '"]'))
+      || document.querySelector('.tab button[onclick*="ci-jobs-status"]');
+    if (startTab) startTab.click();
   </script>
 
   <script>

From 46c795611863b2f9b529234f5abbcba03e84f0d8 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Tue, 16 Jun 2026 14:07:12 -0400
Subject: [PATCH 27/33] switch styling from pill to tab

---
 .../ci_run_report.html.jinja                      | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja
index 3d908e2484eb..92af0fab1cf0 100644
--- a/.github/actions/create_workflow_report/ci_run_report.html.jinja
+++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja
@@ -128,7 +128,7 @@
       text-decoration: underline;
     }
 
-    /* Tab navigation: wrapping pill strip on a white surface that matches the tables */
+    /* Tab navigation: wrapping strip; each button is an open-bottom tab shape */
     .tab {
       display: flex;
       flex-wrap: wrap;
@@ -143,10 +143,12 @@
 
     .tab button.tablinks {
       appearance: none;
-      border: 1px solid color-mix(in srgb, var(--altinity-accent) 60%, var(--altinity-white));
-      background: transparent;
+      box-sizing: border-box;
+      border: 1px solid var(--altinity-accent);
+      border-bottom: 0;
+      border-radius: 0.5rem 0.5rem 0 0;
+      background: var(--altinity-white);
       padding: 0.55rem 0.95rem;
-      border-radius: 0.5rem;
       font: inherit;
       cursor: pointer;
       white-space: nowrap;
@@ -159,8 +161,9 @@
     }
 
     .tab button.tablinks.active {
-      background: var(--altinity-accent);
-      color: var(--altinity-white);
+      background: var(--altinity-hover);
+      box-shadow: inset 0 2px 0 var(--altinity-accent);
+      color: var(--altinity-accent);
       font-weight: 600;
     }
 

From 83f75c43cda60a762e9b3acf10348b1675542742 Mon Sep 17 00:00:00 2001
From: CarlosFelipeOR <carlosfelipeor@gmail.com>
Date: Mon, 22 Jun 2026 22:11:03 -0300
Subject: [PATCH 28/33] Escape PR title and labels in PRs in Release table to
 prevent HTML injection

Signed-off-by: CarlosFelipeOR <carlosfelipeor@gmail.com>
---
 .../create_workflow_report.py                    | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py
index 12adda5c8608..2aadba55febd 100755
--- a/.github/actions/create_workflow_report/create_workflow_report.py
+++ b/.github/actions/create_workflow_report/create_workflow_report.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import argparse
+import html
 import os
 import sys
 import time
@@ -194,11 +195,13 @@ def _enrich_prs_in_release_merge_prs(df: pd.DataFrame, repo: str) -> pd.DataFram
             )
         pr = response.json()
         label_names = [l["name"] for l in pr.get("labels", [])]
+        # Escape at collection time so untrusted PR data cannot inject HTML
+        # regardless of how columns are later rendered.
         rows.append(
             {
-                "pr_name": pr.get("title", ""),
+                "pr_name": html.escape(str(pr.get("title", "")), quote=True),
                 "pr_number": pr_number,
-                "pr_labels": ", ".join(sorted(label_names)),
+                "pr_labels": html.escape(", ".join(sorted(label_names)), quote=True),
             }
         )
     return pd.DataFrame(rows)
@@ -808,7 +811,11 @@ def url_to_html_link(url: str) -> str:
 
 
 def format_pr_labels_with_verification(labels: str) -> str:
-    """Format the PR labels with verification."""
+    """Format the PR labels with verification.
+
+    Expects ``labels`` to be already HTML-escaped at collection time
+    (see _enrich_prs_in_release_merge_prs).
+    """
     labels_list = labels.split(", ")
     if "verified" in labels_list:
         return labels
@@ -849,6 +856,9 @@ def format_col_name(col_name: str) -> str:
 
     results.columns = [format_col_name(col) for col in results.columns]
 
+    # NOTE: to_html is called with escape=False, so any new column carrying
+    # untrusted text must either be HTML-escaped at collection time or have an
+    # explicit formatter below that escapes it.
     formatters = {
         "Results Link": url_to_html_link,
         "Test Name": format_test_name_for_linewrap,

From 3e92ac522bf01266082f3ad689df8ab788396836 Mon Sep 17 00:00:00 2001
From: CarlosFelipeOR <carlosfelipeor@gmail.com>
Date: Tue, 23 Jun 2026 02:02:02 -0300
Subject: [PATCH 29/33] Fix Grype Scan: pin boto3==1.43.33 to avoid botocore
 runtime ImportError (#1931)

Signed-off-by: CarlosFelipeOR <carlosfelipeor@gmail.com>
---
 .github/workflows/grype_scan.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml
index ed9edeefe4b6..b6ac8e923bcd 100644
--- a/.github/workflows/grype_scan.yml
+++ b/.github/workflows/grype_scan.yml
@@ -50,7 +50,7 @@ jobs:
             sudo apt-get install -y python3-pip python3-venv
             python3 -m venv venv
             source venv/bin/activate
-            pip install --upgrade requests chardet urllib3 unidiff boto3 PyGithub
+            pip install --upgrade requests chardet urllib3 unidiff 'boto3==1.43.33' PyGithub
             pip install testflows==$TESTFLOWS_VERSION awscli==1.33.28
             echo PATH=$PATH >>$GITHUB_ENV
 

From 9b79223e1ca1ce6df348c5b7a323adeddc558444 Mon Sep 17 00:00:00 2001
From: strtgbb <146047128+strtgbb@users.noreply.github.com>
Date: Fri, 26 Jun 2026 09:23:49 -0400
Subject: [PATCH 30/33] fix regression not respecting arm skip flag

---
 .github/workflows/master.yml             | 2 +-
 .github/workflows/pull_request.yml       | 2 +-
 ci/praktika/yaml_additional_templates.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index 5331b02c2295..df06658a6c25 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -5948,7 +5948,7 @@ jobs:
       workflow_config: ${{ needs.config_workflow.outputs.data }}
   RegressionTestsAarch64:
     needs: [config_workflow, build_arm_binary, stateless_tests_arm_binary_parallel]
-    if: ${{  !cancelled() && !contains(needs.*.outputs.pipeline_status, 'failure') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'regression') && !contains(fromJson(needs.config_workflow.outputs.data).workflow_config.custom_data.ci_exclude_tags, 'aarch64')}}
+    if: ${{  !cancelled() && !contains(needs.*.outputs.pipeline_status, 'failure') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'regression') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'aarch64')}}
     uses: ./.github/workflows/regression.yml
     secrets: inherit
     with:
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 3d3d36019d2c..506f2576f8f2 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -5527,7 +5527,7 @@ jobs:
       workflow_config: ${{ needs.config_workflow.outputs.data }}
   RegressionTestsAarch64:
     needs: [config_workflow, build_arm_binary, stateless_tests_arm_binary_parallel]
-    if: ${{  !cancelled() && !contains(needs.*.outputs.pipeline_status, 'failure') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'regression') && !contains(fromJson(needs.config_workflow.outputs.data).workflow_config.custom_data.ci_exclude_tags, 'aarch64')}}
+    if: ${{  !cancelled() && !contains(needs.*.outputs.pipeline_status, 'failure') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'regression') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'aarch64')}}
     uses: ./.github/workflows/regression.yml
     secrets: inherit
     with:
diff --git a/ci/praktika/yaml_additional_templates.py b/ci/praktika/yaml_additional_templates.py
index b4f414eee704..67e35a36c3fa 100644
--- a/ci/praktika/yaml_additional_templates.py
+++ b/ci/praktika/yaml_additional_templates.py
@@ -75,7 +75,7 @@ class AltinityWorkflowTemplates:
       workflow_config: ${{ needs.config_workflow.outputs.data }}
   RegressionTestsAarch64:
     needs: [config_workflow, build_arm_binary, stateless_tests_arm_binary_parallel]
-    if: ${{  !cancelled() && !contains(needs.*.outputs.pipeline_status, 'failure') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'regression') && !contains(fromJson(needs.config_workflow.outputs.data).workflow_config.custom_data.ci_exclude_tags, 'aarch64')}}
+    if: ${{  !cancelled() && !contains(needs.*.outputs.pipeline_status, 'failure') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'regression') && !contains(fromJson(needs.config_workflow.outputs.data).JOB_KV_DATA.ci_exclude_tags, 'aarch64')}}
     uses: ./.github/workflows/regression.yml
     secrets: inherit
     with:

From 46c8d3203047798a02495e033bc5fcaac55af4b6 Mon Sep 17 00:00:00 2001
From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com>
Date: Thu, 18 Jun 2026 13:45:11 +0200
Subject: [PATCH 31/33] cache vended catalog credentials

---
 docs/en/engines/database-engines/datalake.md  |   1 +
 src/Common/ProfileEvents.cpp                  |   2 +
 src/Databases/DataLake/DatabaseDataLake.cpp   |   6 +
 .../DataLake/DatabaseDataLakeSettings.cpp     |   1 +
 src/Databases/DataLake/ICatalog.h             |   3 +
 src/Databases/DataLake/RestCatalog.cpp        | 172 ++++++++++++++++--
 src/Databases/DataLake/RestCatalog.h          |  33 +++-
 src/Databases/DataLake/StorageCredentials.h   |   7 +-
 8 files changed, 204 insertions(+), 21 deletions(-)

diff --git a/docs/en/engines/database-engines/datalake.md b/docs/en/engines/database-engines/datalake.md
index 9c20511172b3..f90294b87bd4 100644
--- a/docs/en/engines/database-engines/datalake.md
+++ b/docs/en/engines/database-engines/datalake.md
@@ -56,6 +56,7 @@ The following settings are supported:
 | `storage_endpoint`      | Endpoint URL for the underlying storage                                                       |
 | `oauth_server_uri`      | URI of the OAuth2 authorization server for authentication                                     |
 | `vended_credentials`    | Boolean indicating whether to use vended credentials from the catalog (supports AWS S3 and Azure ADLS Gen2) |
+| `vended_credentials_cache_ttl` | Maximum cache entry lifetime (in seconds) for vended credentials (REST catalogs only). Default `300`; `0` disables caching. |
 | `aws_access_key_id`     | AWS access key ID for S3/Glue access (if not using vended credentials)                        |
 | `aws_secret_access_key` | AWS secret access key for S3/Glue access (if not using vended credentials)                    |
 | `region`                | AWS region for the service (e.g., `us-east-1`)                                                |
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index 993e9d6a85cd..0828ee9dc92f 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -1429,6 +1429,8 @@ The server successfully detected this situation and will download merged part fr
     M(DataLakeRestCatalogGetTableMetadataMicroseconds, "Total time of 'get table metadata' requests to Iceberg REST catalog.", ValueType::Microseconds) \
     M(DataLakeRestCatalogGetCredentials, "Number of 'get credentials' requests to Iceberg REST catalog.", ValueType::Number) \
     M(DataLakeRestCatalogGetCredentialsMicroseconds, "Total time of 'get credentials' requests to Iceberg REST catalog.", ValueType::Microseconds) \
+    M(DataLakeRestCatalogCredentialsVended, "Number of table metadata requests to Iceberg REST catalog that asked the catalog to vend storage credentials (i.e. cache miss).", ValueType::Number) \
+    M(DataLakeRestCatalogCredentialsCacheHits, "Number of table metadata requests to Iceberg REST catalog that reused cached storage credentials and did not ask the catalog to vend new ones.", ValueType::Number) \
     M(DataLakeRestCatalogCreateNamespace, "Number of 'create namespace' requests to Iceberg REST catalog.", ValueType::Number) \
     M(DataLakeRestCatalogCreateNamespaceMicroseconds, "Total time of 'create namespace' requests to Iceberg REST catalog.", ValueType::Microseconds) \
     M(DataLakeRestCatalogCreateTable, "Number of 'create table' requests to Iceberg REST catalog.", ValueType::Number) \
diff --git a/src/Databases/DataLake/DatabaseDataLake.cpp b/src/Databases/DataLake/DatabaseDataLake.cpp
index 112c07809a03..975806c6c3ca 100644
--- a/src/Databases/DataLake/DatabaseDataLake.cpp
+++ b/src/Databases/DataLake/DatabaseDataLake.cpp
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <array>
+#include <chrono>
 #include <memory>
 #include <Databases/DataLake/DatabaseDataLake.h>
 #include <Core/SettingsEnums.h>
@@ -65,6 +66,7 @@ namespace DatabaseDataLakeSetting
     extern const DatabaseDataLakeSettingsString oauth_server_uri;
     extern const DatabaseDataLakeSettingsBool oauth_server_use_request_body;
     extern const DatabaseDataLakeSettingsBool vended_credentials;
+    extern const DatabaseDataLakeSettingsUInt64 vended_credentials_cache_ttl;
     extern const DatabaseDataLakeSettingsString object_storage_cluster;
     extern const DatabaseDataLakeSettingsString aws_access_key_id;
     extern const DatabaseDataLakeSettingsString aws_secret_access_key;
@@ -335,6 +337,10 @@ std::shared_ptr<DataLake::ICatalog> DatabaseDataLake::getCatalog() const
         }
     }
 
+    if (catalog_impl)
+        catalog_impl->setVendedCredentialsCacheTTL(
+            std::chrono::seconds(settings[DatabaseDataLakeSetting::vended_credentials_cache_ttl].value));
+
     return catalog_impl;
 }
 
diff --git a/src/Databases/DataLake/DatabaseDataLakeSettings.cpp b/src/Databases/DataLake/DatabaseDataLakeSettings.cpp
index c65e0f18cea7..f12268490840 100644
--- a/src/Databases/DataLake/DatabaseDataLakeSettings.cpp
+++ b/src/Databases/DataLake/DatabaseDataLakeSettings.cpp
@@ -20,6 +20,7 @@ namespace ErrorCodes
     DECLARE(DatabaseDataLakeCatalogType, catalog_type, DatabaseDataLakeCatalogType::NONE, "Catalog type", 0) \
     DECLARE(String, catalog_credential, "", "", 0) \
     DECLARE(Bool, vended_credentials, true, "Use vended credentials (storage credentials) from catalog", 0) \
+    DECLARE(UInt64, vended_credentials_cache_ttl, 300, "Maximum cache entry lifetime (in seconds) for vended credentials. '0' disables caching.", 0) \
     DECLARE(String, auth_scope, "PRINCIPAL_ROLE:ALL", "Authorization scope for client credentials or token exchange", 0) \
     DECLARE(String, oauth_server_uri, "", "OAuth server uri", 0) \
     DECLARE(Bool, oauth_server_use_request_body, true, "Put parameters into request body or query params", 0) \
diff --git a/src/Databases/DataLake/ICatalog.h b/src/Databases/DataLake/ICatalog.h
index 9ccb420ac759..cbd700fbabcc 100644
--- a/src/Databases/DataLake/ICatalog.h
+++ b/src/Databases/DataLake/ICatalog.h
@@ -1,4 +1,5 @@
 #pragma once
+#include <chrono>
 #include <optional>
 #include <Core/Types.h>
 #include <Core/NamesAndTypes.h>
@@ -219,6 +220,8 @@ class ICatalog
         return std::nullopt;
     }
 
+    virtual void setVendedCredentialsCacheTTL(std::chrono::seconds /*ttl*/) {}
+
 protected:
     /// Name of the warehouse,
     /// which is sometimes also called "catalog name".
diff --git a/src/Databases/DataLake/RestCatalog.cpp b/src/Databases/DataLake/RestCatalog.cpp
index 60e6dbc59c12..e030c873a152 100644
--- a/src/Databases/DataLake/RestCatalog.cpp
+++ b/src/Databases/DataLake/RestCatalog.cpp
@@ -44,6 +44,11 @@
 #include <Poco/Net/HTTPSClientSession.h>
 #include <Poco/Net/SSLManager.h>
 #include <Poco/StreamCopier.h>
+#include <Poco/DateTime.h>
+#include <Poco/DateTimeFormat.h>
+#include <Poco/DateTimeParser.h>
+#include <Poco/StringTokenizer.h>
+#include <Poco/Timestamp.h>
 
 
 namespace DB::ErrorCodes
@@ -66,6 +71,8 @@ namespace ProfileEvents
     extern const Event DataLakeRestCatalogGetTableMetadataMicroseconds;
     extern const Event DataLakeRestCatalogGetCredentials;
     extern const Event DataLakeRestCatalogGetCredentialsMicroseconds;
+    extern const Event DataLakeRestCatalogCredentialsVended;
+    extern const Event DataLakeRestCatalogCredentialsCacheHits;
     extern const Event DataLakeRestCatalogCreateNamespace;
     extern const Event DataLakeRestCatalogCreateNamespaceMicroseconds;
     extern const Event DataLakeRestCatalogCreateTable;
@@ -928,15 +935,28 @@ bool RestCatalog::getTableMetadataImpl(
             "Namespace {} is filtered by `namespaces` database parameter", namespace_name);
 
     DB::HTTPHeaderEntries headers;
-    if (result.requiresCredentials())
+
+    const bool want_credentials = result.requiresCredentials();
+
+    /// Reuse previously vended credentials is possible
+    std::optional<VendedStorageCredentials> cached_credentials;
+    if (want_credentials)
     {
+        cached_credentials = tryGetCachedCredentials(namespace_name, table_name);
+
         /// Header `X-Iceberg-Access-Delegation` tells catalog to include storage credentials in LoadTableResponse.
         /// Value can be one of the two:
         /// 1. `vended-credentials`
         /// 2. `remote-signing`
         /// Currently we support only the first.
         /// https://github.com/apache/iceberg/blob/3badfe0c1fcf0c0adfc7aa4a10f0b50365c48cf9/open-api/rest-catalog-open-api.yaml#L1832
-        headers.emplace_back("X-Iceberg-Access-Delegation", "vended-credentials");
+        if (cached_credentials)
+            ProfileEvents::increment(ProfileEvents::DataLakeRestCatalogCredentialsCacheHits);
+        else
+        {
+            ProfileEvents::increment(ProfileEvents::DataLakeRestCatalogCredentialsVended);
+            headers.emplace_back("X-Iceberg-Access-Delegation", "vended-credentials");
+        }
     }
 
     const std::string endpoint = std::filesystem::path(NAMESPACES_ENDPOINT) / encodeNamespaceForURI(namespace_name) / "tables" / table_name;
@@ -994,16 +1014,28 @@ bool RestCatalog::getTableMetadataImpl(
         result.setSchema(*schema);
     }
 
-    if (result.isDefaultReadableTable() && result.requiresCredentials() && object->has("config"))
+    if (want_credentials && result.isDefaultReadableTable())
     {
-        auto config_object = object->get("config").extract<Poco::JSON::Object::Ptr>();
-        if (!config_object)
-            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Cannot parse config result");
-        auto [parsed_credentials, parsed_endpoint] = getCredentialsAndEndpoint(config_object, location);
-        if (parsed_credentials)
-            result.setStorageCredentials(parsed_credentials);
-        if (!parsed_endpoint.empty())
-            result.setEndpoint(parsed_endpoint);
+        if (cached_credentials)
+        {
+            result.setStorageCredentials(cached_credentials->credentials);
+            if (!cached_credentials->endpoint.empty())
+                result.setEndpoint(cached_credentials->endpoint);
+        }
+        else if (object->has("config"))
+        {
+            auto config_object = object->get("config").extract<Poco::JSON::Object::Ptr>();
+            if (!config_object)
+                throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Cannot parse config result");
+            auto parsed = getCredentialsAndEndpoint(config_object, location);
+            if (parsed.credentials)
+            {
+                result.setStorageCredentials(parsed.credentials);
+                cacheCredentials(namespace_name, table_name, parsed);
+            }
+            if (!parsed.endpoint.empty())
+                result.setEndpoint(parsed.endpoint);
+        }
     }
 
     if (result.requiresDataLakeSpecificProperties())
@@ -1229,7 +1261,44 @@ void RestCatalog::dropTable(const String & namespace_name, const String & table_
     }
 }
 
-std::pair<std::shared_ptr<IStorageCredentials>, String> RestCatalog::getCredentialsAndEndpoint(Poco::JSON::Object::Ptr object, const String & location) const
+namespace
+{
+std::optional<std::chrono::system_clock::time_point> parseSasTokenExpiry(const std::string & sas_token)
+{
+    std::string token = sas_token;
+    if (!token.empty() && token.front() == '?')
+        token.erase(0, 1);
+
+    Poco::StringTokenizer params(token, "&", Poco::StringTokenizer::TOK_IGNORE_EMPTY | Poco::StringTokenizer::TOK_TRIM);
+    for (const auto & param : params)
+    {
+        if (!param.starts_with("se="))
+            continue;
+
+        try
+        {
+            std::string decoded;
+            Poco::URI::decode(param.substr(3), decoded);
+
+            int time_zone_differential = 0;
+            Poco::DateTime date_time;
+            if (Poco::DateTimeParser::tryParse(Poco::DateTimeFormat::ISO8601_FORMAT, decoded, date_time, time_zone_differential))
+            {
+                date_time.makeUTC(time_zone_differential);
+                return std::chrono::system_clock::from_time_t(date_time.timestamp().epochTime());
+            }
+        }
+        catch (...)
+        {
+            return std::nullopt;
+        }
+        return std::nullopt;
+    }
+    return std::nullopt;
+}
+}
+
+VendedStorageCredentials RestCatalog::getCredentialsAndEndpoint(Poco::JSON::Object::Ptr object, const String & location) const
 {
     auto storage_type = parseStorageTypeFromLocation(location);
     switch (storage_type)
@@ -1240,11 +1309,13 @@ std::pair<std::shared_ptr<IStorageCredentials>, String> RestCatalog::getCredenti
             static constexpr auto secret_access_key_str = "s3.secret-access-key";
             static constexpr auto session_token_str = "s3.session-token";
             static constexpr auto storage_endpoint_str = "s3.endpoint";
+            static constexpr auto session_token_expires_at_ms_str = "s3.session-token-expires-at-ms";
 
             std::string access_key_id;
             std::string secret_access_key;
             std::string session_token;
             std::string storage_endpoint;
+            std::optional<std::chrono::system_clock::time_point> expires_at;
             if (object->has(access_key_id_str))
                 access_key_id = object->get(access_key_id_str).extract<String>();
             if (object->has(secret_access_key_str))
@@ -1253,9 +1324,26 @@ std::pair<std::shared_ptr<IStorageCredentials>, String> RestCatalog::getCredenti
                 session_token = object->get(session_token_str).extract<String>();
             if (object->has(storage_endpoint_str))
                 storage_endpoint = object->get(storage_endpoint_str).extract<String>();
+            if (object->has(session_token_expires_at_ms_str))
+            {
+                try
+                {
+                    static constexpr Int64 max_representable_sec
+                        = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::duration::max()).count();
+                    const Int64 expires_at_ms = object->get(session_token_expires_at_ms_str).convert<Int64>();
+                    if (expires_at_ms <= 0)
+                        expires_at = std::chrono::system_clock::time_point{}; /// Already invalid: do not cache.
+                    else if (expires_at_ms / 1000 < max_representable_sec)
+                        expires_at = std::chrono::system_clock::from_time_t(static_cast<std::time_t>(expires_at_ms / 1000));
+                }
+                catch (...)
+                {
+                    LOG_DEBUG(log, "Failed to parse '{}' from vended credentials config", session_token_expires_at_ms_str);
+                }
+            }
 
             LOG_DEBUG(log, "get tokens for location {}", location);
-            return {std::make_shared<S3Credentials>(access_key_id, secret_access_key, session_token), storage_endpoint};
+            return {std::make_shared<S3Credentials>(access_key_id, secret_access_key, session_token), storage_endpoint, expires_at};
         }
         case StorageType::Azure:
         {
@@ -1277,15 +1365,59 @@ std::pair<std::shared_ptr<IStorageCredentials>, String> RestCatalog::getCredenti
             }
 
             if (!sas_token.empty())
-            {
-                return {std::make_shared<AzureCredentials>(sas_token), ""};
-            }
+                return {std::make_shared<AzureCredentials>(sas_token), "", parseSasTokenExpiry(sas_token)};
             break;
         }
         default:
             break;
     }
-    return {nullptr, ""};
+    return {nullptr, "", std::nullopt};
+}
+
+std::optional<VendedStorageCredentials> RestCatalog::tryGetCachedCredentials(
+    const std::string & namespace_name, const std::string & table_name) const
+{
+    if (vended_credentials_cache_ttl.load(std::memory_order_relaxed) <= std::chrono::seconds::zero())
+        return std::nullopt;
+
+    std::lock_guard lock(credentials_cache_mutex);
+    auto it = credentials_cache.find({namespace_name, table_name});
+    if (it == credentials_cache.end())
+        return std::nullopt;
+    if (std::chrono::system_clock::now() >= it->second.expires_at.value())
+    {
+        credentials_cache.erase(it); /// Drop the stale entry.
+        return std::nullopt;
+    }
+
+    return it->second;
+}
+
+void RestCatalog::cacheCredentials(
+    const std::string & namespace_name, const std::string & table_name, const VendedStorageCredentials & parsed) const
+{
+    const auto ttl = vended_credentials_cache_ttl.load(std::memory_order_relaxed);
+    if (ttl <= std::chrono::seconds::zero())
+        return;
+
+    if (!parsed.credentials || parsed.credentials->isEmpty())
+        return;
+
+    const auto now = std::chrono::system_clock::now();
+
+    /// Cap at the configured TTL so an entry never outlives the documented maximum lifetime.
+    auto refresh_after = now + ttl;
+    if (parsed.expires_at && parsed.expires_at.value() < refresh_after)
+        refresh_after = parsed.expires_at.value();
+    if (refresh_after <= now)
+        return;
+
+    std::lock_guard lock(credentials_cache_mutex);
+
+    if (credentials_cache.size() >= credentials_cache_cleanup_threshold)
+        std::erase_if(credentials_cache, [&now](const auto & entry) { return now >= entry.second.expires_at.value(); });
+    credentials_cache[{namespace_name, table_name}]
+        = VendedStorageCredentials{parsed.credentials, parsed.endpoint, refresh_after};
 }
 
 ICatalog::CredentialsRefreshCallback RestCatalog::getCredentialsConfigurationCallback(const DB::StorageID & storage_id)
@@ -1344,8 +1476,10 @@ ICatalog::CredentialsRefreshCallback RestCatalog::getCredentialsConfigurationCal
             throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Cannot read table {}, because no 'metadata-location' in response", table_name);
         }
 
-        auto [new_credentials, _] = getCredentialsAndEndpoint(config_object, location);
-        return new_credentials;
+        auto parsed = getCredentialsAndEndpoint(config_object, location);
+        /// Refresh the per-table cache so subsequent queries reuse these freshly vended credentials.
+        cacheCredentials(namespace_name, table_name, parsed);
+        return parsed.credentials;
     };
 }
 
diff --git a/src/Databases/DataLake/RestCatalog.h b/src/Databases/DataLake/RestCatalog.h
index 49fe684e0eaa..9184e93e4530 100644
--- a/src/Databases/DataLake/RestCatalog.h
+++ b/src/Databases/DataLake/RestCatalog.h
@@ -7,7 +7,13 @@
 #include <IO/ReadWriteBufferFromHTTP.h>
 #include <IO/HTTPHeaderEntries.h>
 #include <Interpreters/Context_fwd.h>
+#include <base/defines.h>
+#include <atomic>
+#include <chrono>
 #include <filesystem>
+#include <map>
+#include <mutex>
+#include <optional>
 #include <Poco/JSON/Object.h>
 
 namespace DB
@@ -31,6 +37,13 @@ struct AccessToken
     }
 };
 
+struct VendedStorageCredentials
+{
+    std::shared_ptr<IStorageCredentials> credentials;
+    std::string endpoint;
+    std::optional<std::chrono::system_clock::time_point> expires_at;
+};
+
 class RestCatalog : public ICatalog, public DB::WithContext
 {
 public:
@@ -82,6 +95,8 @@ class RestCatalog : public ICatalog, public DB::WithContext
 
     ICatalog::CredentialsRefreshCallback getCredentialsConfigurationCallback(const DB::StorageID & storage_id) override;
 
+    void setVendedCredentialsCacheTTL(std::chrono::seconds ttl) override { vended_credentials_cache_ttl.store(ttl, std::memory_order_relaxed); }
+
     String getClientId() const { return client_id; }
     String getClientSecret() const { return client_secret; }
 
@@ -127,6 +142,16 @@ class RestCatalog : public ICatalog, public DB::WithContext
     bool oauth_server_use_request_body;
     mutable std::optional<AccessToken> access_token;
 
+    /// TTL for caching vended credentials per table (0 means no caching).
+    std::atomic<std::chrono::seconds> vended_credentials_cache_ttl{std::chrono::seconds::zero()};
+
+    /// Sweep trigger threshold, not capacity!
+    static constexpr size_t credentials_cache_cleanup_threshold = 1000;
+    mutable std::mutex credentials_cache_mutex;
+
+    mutable std::map<std::pair<std::string, std::string>, VendedStorageCredentials> credentials_cache
+        TSA_GUARDED_BY(credentials_cache_mutex);
+
 public:
     class AllowedNamespaces
     {
@@ -194,7 +219,13 @@ class RestCatalog : public ICatalog, public DB::WithContext
         const String & method = Poco::Net::HTTPRequest::HTTP_POST,
         bool ignore_result = false) const;
 
-    std::pair<std::shared_ptr<IStorageCredentials>, String> getCredentialsAndEndpoint(Poco::JSON::Object::Ptr object, const String & location) const;
+    VendedStorageCredentials getCredentialsAndEndpoint(Poco::JSON::Object::Ptr object, const String & location) const;
+
+    std::optional<VendedStorageCredentials> tryGetCachedCredentials(
+        const std::string & namespace_name, const std::string & table_name) const;
+
+    void cacheCredentials(
+        const std::string & namespace_name, const std::string & table_name, const VendedStorageCredentials & parsed) const;
 
     AccessToken retrieveAccessToken() const;
 };
diff --git a/src/Databases/DataLake/StorageCredentials.h b/src/Databases/DataLake/StorageCredentials.h
index 8d3d75d029e2..2e500e16885a 100644
--- a/src/Databases/DataLake/StorageCredentials.h
+++ b/src/Databases/DataLake/StorageCredentials.h
@@ -17,6 +17,9 @@ class IStorageCredentials
     virtual ~IStorageCredentials() = default;
 
     virtual void addCredentialsToEngineArgs(DB::ASTs & engine_args) const = 0;
+
+    /// True when the credentials are unusable (mandatory fields empty); such credentials are not cached.
+    virtual bool isEmpty() const = 0;
 };
 
 class S3Credentials final : public IStorageCredentials
@@ -31,7 +34,7 @@ class S3Credentials final : public IStorageCredentials
         , session_token(session_token_)
     {}
 
-    bool isEmpty() const { return access_key_id.empty() || secret_access_key.empty(); }
+    bool isEmpty() const override { return access_key_id.empty() || secret_access_key.empty(); }
 
     void addCredentialsToEngineArgs(DB::ASTs & engine_args) const override
     {
@@ -81,6 +84,8 @@ class AzureCredentials final : public IStorageCredentials
         engine_args.push_back(DB::make_intrusive<DB::ASTLiteral>(sas_token));
     }
 
+    bool isEmpty() const override { return sas_token.empty(); }
+
 private:
     std::string sas_token;
 };

From c10599bc33ad883d957832cc9cd91139420d2e4f Mon Sep 17 00:00:00 2001
From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com>
Date: Thu, 18 Jun 2026 18:06:43 +0200
Subject: [PATCH 32/33] add test

---
 .../test.py                                   | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tests/integration/test_database_iceberg_lakekeeper_catalog/test.py b/tests/integration/test_database_iceberg_lakekeeper_catalog/test.py
index 9a9cea39fd09..90a71b7ced26 100644
--- a/tests/integration/test_database_iceberg_lakekeeper_catalog/test.py
+++ b/tests/integration/test_database_iceberg_lakekeeper_catalog/test.py
@@ -12,6 +12,7 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     DoubleType,
+    IntegerType,
     NestedField,
     StringType,
 )
@@ -384,3 +385,76 @@ def test_invalid_auth_header_format(started_cluster):
         )
     assert "Invalid auth header format" in str(err.value)
 
+
+def get_credentials_profile_events(node, query_id):
+    node.query("SYSTEM FLUSH LOGS")
+    vended = int(node.query(
+        f"SELECT ProfileEvents['DataLakeRestCatalogCredentialsVended'] "
+        f"FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'"
+    ))
+    hits = int(node.query(
+        f"SELECT ProfileEvents['DataLakeRestCatalogCredentialsCacheHits'] "
+        f"FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'"
+    ))
+    return vended, hits
+
+
+def test_vended_credentials_cache(started_cluster):
+    node = started_cluster.instances["node1"]
+    catalog = load_catalog_impl(started_cluster)
+
+    test_ref = f"test_vended_credentials_cache_{uuid.uuid4().hex[:8]}"
+    namespace = (f"{test_ref}_namespace",)
+    table_name = f"{test_ref}_table"
+    db_name = f"{test_ref}_database"
+
+    if namespace not in catalog.list_namespaces():
+        catalog.create_namespace(namespace)
+
+    schema = Schema(
+        NestedField(field_id=1, name="id", field_type=IntegerType(), required=False),
+        NestedField(field_id=2, name="data", field_type=StringType(), required=False),
+    )
+    table = catalog.create_table(
+        namespace + (table_name,),
+        schema=schema,
+        properties={"write.metadata.compression-codec": "none"},
+    )
+    table.append(
+        pa.Table.from_pandas(
+            pd.DataFrame({"id": [1], "data": ["x"]}).astype({"id": "int32"})
+        )
+    )
+
+    query = f"SELECT count() FROM {db_name}.`{namespace[0]}.{table_name}`"
+
+    # Caching enabled (default TTL): the second query reuses cached credentials
+    # and does not ask the catalog to vend them again.
+    create_clickhouse_iceberg_database(started_cluster, node, db_name)
+
+    qid = f"{test_ref}-cache-1-{uuid.uuid4()}"
+    node.query(query, query_id=qid)
+    vended, _ = get_credentials_profile_events(node, qid)
+    assert vended >= 1
+
+    qid = f"{test_ref}-cache-2-{uuid.uuid4()}"
+    node.query(query, query_id=qid)
+    vended, hits = get_credentials_profile_events(node, qid)
+    assert vended == 0 and hits >= 1
+
+    # Caching disabled (TTL = 0): every query asks the catalog to vend credentials.
+    create_clickhouse_iceberg_database(
+        started_cluster, node, db_name,
+        additional_settings={"vended_credentials_cache_ttl": 0},
+    )
+
+    qid = f"{test_ref}-nocache-1-{uuid.uuid4()}"
+    node.query(query, query_id=qid)
+    vended, hits = get_credentials_profile_events(node, qid)
+    assert vended >= 1 and hits == 0
+
+    qid = f"{test_ref}-nocache-2-{uuid.uuid4()}"
+    node.query(query, query_id=qid)
+    vended, hits = get_credentials_profile_events(node, qid)
+    assert vended >= 1 and hits == 0
+

From 6b15b03c098f4767dd46686b5433065a359a367a Mon Sep 17 00:00:00 2001
From: Daniel Q <il9ue@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:49:59 +0200
Subject: [PATCH 33/33] Address review (#1895)

move Avro schemas to helpers, drop out-of-scope stats tests, trim comments, static table-uuid, add MergeTree roundtrip
---
 .../helpers/iceberg_avro_schemas.py           |  49 +++
 ...t_iceberg_read_optimization_empty_stats.py | 342 +++---------------
 2 files changed, 93 insertions(+), 298 deletions(-)
 create mode 100644 tests/integration/helpers/iceberg_avro_schemas.py

diff --git a/tests/integration/helpers/iceberg_avro_schemas.py b/tests/integration/helpers/iceberg_avro_schemas.py
new file mode 100644
index 000000000000..6a9e72670b23
--- /dev/null
+++ b/tests/integration/helpers/iceberg_avro_schemas.py
@@ -0,0 +1,49 @@
+import json
+
+# Iceberg v2 manifest list schema (only the fields ClickHouse reads).
+MANIFEST_LIST_SCHEMA_STR = json.dumps({
+    "type": "record",
+    "name": "manifest_file",
+    "fields": [
+        {"name": "manifest_path",        "type": "string"},
+        {"name": "manifest_length",      "type": "long"},
+        {"name": "partition_spec_id",    "type": "int"},
+        {"name": "content",              "type": "int"},
+        {"name": "sequence_number",      "type": "long"},
+        {"name": "min_sequence_number",  "type": "long"},
+        {"name": "added_snapshot_id",    "type": "long"},
+        {"name": "added_files_count",    "type": "int"},
+        {"name": "existing_files_count", "type": "int"},
+        {"name": "deleted_files_count",  "type": "int"},
+        {"name": "added_rows_count",     "type": "long"},
+        {"name": "existing_rows_count",  "type": "long"},
+        {"name": "deleted_rows_count",   "type": "long"},
+    ],
+})
+
+# Stats-less manifest entry: no optional stats fields -> empty columns_info.
+MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = json.dumps({
+    "type": "record",
+    "name": "manifest_entry",
+    "fields": [
+        {"name": "status",               "type": "int"},
+        {"name": "snapshot_id",          "type": ["null", "long"]},
+        {"name": "sequence_number",      "type": ["null", "long"]},
+        {"name": "file_sequence_number", "type": ["null", "long"]},
+        {
+            "name": "data_file",
+            "type": {
+                "type": "record",
+                "name": "r2",
+                "fields": [
+                    {"name": "content",            "type": "int"},
+                    {"name": "file_path",          "type": "string"},
+                    {"name": "file_format",        "type": "string"},
+                    {"name": "partition", "type": {"type": "record", "name": "r102", "fields": []}},
+                    {"name": "record_count",       "type": "long"},
+                    {"name": "file_size_in_bytes", "type": "long"},
+                ],
+            },
+        },
+    ],
+})
diff --git a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
index fe0c8e6d085f..e1a2afe20ef5 100644
--- a/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
+++ b/tests/integration/test_storage_iceberg_no_spark/test_iceberg_read_optimization_empty_stats.py
@@ -1,157 +1,30 @@
 #!/usr/bin/env python3
 """
-Regression coverage for Altinity/ClickHouse#1545:
-the Iceberg read optimization (``allow_experimental_iceberg_read_optimization=1``)
-returned all-NULL columns when the manifest carried no per-file column statistics.
-
-Root cause
-----------
-When a manifest's optional stats fields (``value_counts``, ``column_sizes``,
-``null_value_counts``, ``lower_bounds``, ``upper_bounds``) are all absent --
-common for non-Spark writers such as pyiceberg with default settings --
-``DataFileMetaInfo::columns_info`` is left empty.  The optimization's
-absent-column loop in ``StorageObjectStorageSource::createReader`` then iterated
-every requested column, found none in the empty ``columns_info`` map, and
-injected each nullable column as a constant ``NULL``.  With every requested
-column constant, ``need_only_count`` became true and the Parquet file was read
-in count-only mode: correct row count, every value ``NULL``.
-
-The fix gates that loop on ``!columns_info.empty()``; an empty map now falls
-through to the Parquet reader, which reads physically-present columns normally
-and synthesizes schema-evolved-absent columns as ``NULL`` via
-``IcebergMetadata::getInitialSchemaByPath``.
-
-Test taxonomy
--------------
-Discriminating (fail on the unpatched build, pass on the fixed build):
-  - test_stats_less_manifest_returns_real_data
-  - test_stats_less_manifest_schema_evolution_absent_column
-Cluster/serialization path (the cluster JSON round-trip of an empty
-``columns_info``):
-  - test_stats_less_manifest_cluster_returns_real_data
-Non-regression guards (pass on both builds; the optimization's legitimate
-absent-NULL and present-column behavior must keep working when stats exist):
-  - test_non_empty_stats_absent_column_still_null
-  - test_non_empty_stats_returns_real_data
-
-Why the Iceberg fixtures are generated in code rather than checked in as static
-files: every manifest entry and the table ``metadata.json`` embed per-test
-runtime values -- the UUID-suffixed container path, the byte sizes of the
-freshly written Parquet and Avro files, and a random ``table-uuid`` -- so they
-are templates filled at run time, not the static blobs that a checked-in fixture
-(e.g. ``tests/queries/0_stateless/data_minio/.../v1.metadata.json``) would be.
-The one genuinely static part, the Avro schemas, is deduplicated into the
-module-level constants below; the stats-less manifest entry -- the artifact
-under test -- is necessarily produced in code.
+Regression coverage for Altinity/ClickHouse#1545.
 """
-
 import json
 import os
 import tempfile
 import time
-import uuid
 
 import avro.datafile
 import avro.io
 import avro.schema
 import pyarrow as pa
 import pyarrow.parquet as pq
-import pytest
 
 from helpers.iceberg_utils import get_uuid_str
 from helpers.s3_tools import LocalUploader
+from helpers.iceberg_avro_schemas import (
+    MANIFEST_LIST_SCHEMA_STR,
+    MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
+)
 
-
-# The three rows every fixture stores; ``data`` is omitted from the Parquet file
-# when a test needs a column the file physically lacks.
 _IDS = [1, 2, 3]
 _DATA = ["hello", "world", "iceberg"]
 _REAL_DATA_RESULT = "1\thello\n2\tworld\n3\ticeberg"
 _DATA_NULL_RESULT = "1\t\\N\n2\t\\N\n3\t\\N"
-
-
-# Iceberg v2 manifest list Avro schema (minimal fields needed by ClickHouse).
-_MANIFEST_LIST_SCHEMA_STR = json.dumps({
-    "type": "record",
-    "name": "manifest_file",
-    "fields": [
-        {"name": "manifest_path",        "type": "string"},
-        {"name": "manifest_length",      "type": "long"},
-        {"name": "partition_spec_id",    "type": "int"},
-        {"name": "content",              "type": "int"},
-        {"name": "sequence_number",      "type": "long"},
-        {"name": "min_sequence_number",  "type": "long"},
-        {"name": "added_snapshot_id",    "type": "long"},
-        {"name": "added_files_count",    "type": "int"},
-        {"name": "existing_files_count", "type": "int"},
-        {"name": "deleted_files_count",  "type": "int"},
-        {"name": "added_rows_count",     "type": "long"},
-        {"name": "existing_rows_count",  "type": "long"},
-        {"name": "deleted_rows_count",   "type": "long"},
-    ],
-})
-
-
-def _data_file_fields(stats_fields):
-    """data_file record fields, optionally followed by the given stats arrays."""
-    fields = [
-        {"name": "content",            "type": "int"},
-        {"name": "file_path",          "type": "string"},
-        {"name": "file_format",        "type": "string"},
-        {"name": "partition", "type": {"type": "record", "name": "r102", "fields": []}},
-        {"name": "record_count",       "type": "long"},
-        {"name": "file_size_in_bytes", "type": "long"},
-    ]
-    for i, name in enumerate(stats_fields):
-        fields.append({
-            "name": name,
-            "type": {
-                "type": "array",
-                "items": {
-                    "type": "record",
-                    "name": f"k81v81_{i}",
-                    "fields": [
-                        {"name": "key",   "type": "int"},
-                        {"name": "value", "type": "long"},
-                    ],
-                },
-            },
-        })
-    return fields
-
-
-def _manifest_entry_schema(stats_fields):
-    """Iceberg v2 manifest entry Avro schema carrying the given stats arrays."""
-    return json.dumps({
-        "type": "record",
-        "name": "manifest_entry",
-        "fields": [
-            {"name": "status",               "type": "int"},
-            {"name": "snapshot_id",          "type": ["null", "long"]},
-            {"name": "sequence_number",      "type": ["null", "long"]},
-            {"name": "file_sequence_number", "type": ["null", "long"]},
-            {
-                "name": "data_file",
-                "type": {
-                    "type": "record",
-                    "name": "r2",
-                    "fields": _data_file_fields(stats_fields),
-                },
-            },
-        ],
-    })
-
-
-# Stats-less manifest: omits every optional stats field -> columns_info is empty.
-_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR = _manifest_entry_schema([])
-# Partial stats: value_counts only (column_sizes and null_value_counts absent);
-# enough to populate columns_info so the empty-stats guard is not triggered.
-_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR = _manifest_entry_schema(["value_counts"])
-# Full stats: the common Spark-written shape.
-_MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR = _manifest_entry_schema(
-    ["column_sizes", "value_counts", "null_value_counts"]
-)
-
+_TABLE_UUID = "01234567-89ab-cdef-0123-456789abcdef"
 
 def _schema_fields(with_data):
     fields = [{"id": 1, "name": "id", "required": False, "type": "int"}]
@@ -182,27 +55,11 @@ def _create_iceberg_table(
     table_name,
     container_base,
     *,
-    manifest_entry_schema_str,
     file_has_data_column=True,
-    data_file_stats=None,
     schema_evolution=False,
 ):
     """
-    Build a minimal Iceberg v2 table under ``tmpdir/table_name``.
-
-    container_base is the absolute path inside the ClickHouse container where the
-    table is placed after upload; file paths embedded in the Avro records and
-    metadata.json must reference it because they are interpreted at query time
-    inside the container.
-
-    file_has_data_column controls whether the Parquet file physically contains
-    the ``data`` column.  data_file_stats, when given, supplies the manifest
-    entry's stats arrays (e.g. ``value_counts``).  schema_evolution lays the
-    table out with two schemas (0: ``id`` only, 1: ``id``+``data``) and two
-    snapshots so the data file resolves to the older schema id, exercising the
-    Parquet-reader fall-through that synthesizes the absent ``data`` column.
-
-    Returns the local table directory path.
+    Build a minimal stats-less Iceberg v2 table under ``tmpdir/table_name``.
     """
     table_local = os.path.join(tmpdir, table_name)
     data_local  = os.path.join(table_local, "data")
@@ -224,13 +81,11 @@ def _create_iceberg_table(
 
     ts_ms = int(time.time() * 1000)
 
-    # The manifest entry's snapshot_id is the snapshot that wrote the data file.
-    # Under schema evolution that snapshot (id 1) carries the older schema (0),
-    # so the iterator resolves the file to schema 0 while reading the table at
-    # schema 1 -- the condition that makes getInitialSchemaByPath return the
-    # file's own schema as initial_header.
+    # snapshot_id is the snapshot that wrote the file. Under schema evolution it
+    # carries the older schema (0) while the table reads at schema 1, which makes
+    # getInitialSchemaByPath return the file's own schema as initial_header.
     entry_snapshot_id = 1
-    write_schema_id   = 0  # schema the data file was written with
+    write_schema_id   = 0
 
     data_file_record = {
         "content":            0,            # DATA
@@ -240,13 +95,11 @@ def _create_iceberg_table(
         "record_count":       len(_IDS),
         "file_size_in_bytes": data_size,
     }
-    if data_file_stats:
-        data_file_record.update(data_file_stats)
 
     manifest_local_path     = os.path.join(meta_local, "00000-0-manifest.avro")
     manifest_container_path = f"{container_base}/metadata/00000-0-manifest.avro"
     _write_avro(
-        manifest_entry_schema_str,
+        MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
         [{
             "status":               1,             # ADDED
             "snapshot_id":          entry_snapshot_id,
@@ -256,7 +109,7 @@ def _create_iceberg_table(
         }],
         manifest_local_path,
         metadata={
-            # The manifest's own schema is the data file's write schema.
+            # Manifest's own schema = the data file's write schema.
             "schema":         json.dumps(_iceberg_schema(write_schema_id, file_has_data_column)),
             "partition-spec": "[]",
         },
@@ -268,7 +121,7 @@ def _write_manifest_list(snapshot_id):
         local_path     = os.path.join(meta_local, filename)
         container_path = f"{container_base}/metadata/{filename}"
         _write_avro(
-            _MANIFEST_LIST_SCHEMA_STR,
+            MANIFEST_LIST_SCHEMA_STR,
             [{
                 "manifest_path":        manifest_container_path,
                 "manifest_length":      manifest_size,
@@ -289,11 +142,9 @@ def _write_manifest_list(snapshot_id):
         return container_path
 
     if schema_evolution:
-        # Two schemas, two snapshots: the data file was written under schema 0
-        # (id only); the table later evolved to schema 1 (id + data).
+        # Written under schema 0 (id only), later evolved to schema 1 (id+data).
         schemas          = [_iceberg_schema(0, with_data=False), _iceberg_schema(1, with_data=True)]
         current_schema   = 1
-        last_column_id   = 2
         snapshots = [
             {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)},
             {"snapshot-id": 2, "schema-id": 1, "manifest-list": _write_manifest_list(2)},
@@ -301,10 +152,9 @@ def _write_manifest_list(snapshot_id):
         current_snapshot = 2
         last_seq         = 2
     else:
-        # Single schema (id + data); the data file resolves to it directly.
+        # Single schema (id+data)
         schemas          = [_iceberg_schema(0, with_data=True)]
         current_schema   = 0
-        last_column_id   = 2
         snapshots = [
             {"snapshot-id": 1, "schema-id": 0, "manifest-list": _write_manifest_list(1)},
         ]
@@ -313,11 +163,11 @@ def _write_manifest_list(snapshot_id):
 
     metadata = {
         "format-version":       2,
-        "table-uuid":           str(uuid.uuid4()),
+        "table-uuid":           _TABLE_UUID,
         "location":             container_base,
         "last-sequence-number": last_seq,
         "last-updated-ms":      ts_ms,
-        "last-column-id":       last_column_id,
+        "last-column-id":       2,
         "current-schema-id":    current_schema,
         "schemas":              schemas,
         "default-spec-id":       0,
@@ -370,47 +220,19 @@ def _query_local(instance, container_base, optimization=1):
 
 
 def test_stats_less_manifest_returns_real_data(started_cluster_iceberg_no_spark):
-    """
-    Discriminating reproducer for Altinity#1545.
-
-    A stats-less manifest (empty columns_info) must read real data, not all-NULL
-    rows.  Unpatched build: every nullable column is injected as a constant NULL
-    and the result is 3 all-NULL rows.  Fixed build: the empty-stats guard falls
-    through to the Parquet reader and returns the real values.
-    """
     instance       = started_cluster_iceberg_no_spark.instances["node1"]
     table_name     = "test_stats_less_real_" + get_uuid_str()
     container_base = _container_base(table_name)
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_iceberg_table(
-            tmpdir, table_name, container_base,
-            manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
-        )
+        local_dir = _create_iceberg_table(tmpdir, table_name, container_base)
         _upload_to_node(instance, local_dir, container_base)
 
     result = _query_local(instance, container_base)
-    assert result == _REAL_DATA_RESULT, (
-        f"Got: {result!r}\n"
-        "All-NULL rows mean the bug fired: empty columns_info caused the "
-        "optimization to treat every nullable column as absent and inject "
-        "constant NULL (need_only_count=1)."
-    )
+    assert result == _REAL_DATA_RESULT, f"Got: {result!r}"
 
 
 def test_stats_less_manifest_schema_evolution_absent_column(started_cluster_iceberg_no_spark):
-    """
-    Discriminating test for the schema-evolution fall-through (highest risk).
-
-    The Iceberg schema declares ``id``+``data`` but the Parquet file was written
-    under the older schema and physically contains only ``id``.  With the
-    stats-less manifest, the fixed build must fall through to the Parquet reader,
-    read ``id`` for real, and synthesize the absent ``data`` column as NULL via
-    IcebergMetadata::getInitialSchemaByPath (file schema as initial_header).
-
-    Unpatched build: the optimization marks BOTH columns (including ``id``)
-    constant NULL, so ``id`` comes back NULL too -- the discriminator.
-    """
     instance       = started_cluster_iceberg_no_spark.instances["node1"]
     table_name     = "test_stats_less_evolve_" + get_uuid_str()
     container_base = _container_base(table_name)
@@ -418,42 +240,27 @@ def test_stats_less_manifest_schema_evolution_absent_column(started_cluster_iceb
     with tempfile.TemporaryDirectory() as tmpdir:
         local_dir = _create_iceberg_table(
             tmpdir, table_name, container_base,
-            manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
             file_has_data_column=False,
             schema_evolution=True,
         )
         _upload_to_node(instance, local_dir, container_base)
 
     result = _query_local(instance, container_base)
-    assert result == _DATA_NULL_RESULT, (
-        f"Got: {result!r}\n"
-        "Expected real 'id' with NULL 'data'.  If 'id' is also NULL the bug "
-        "fired; if the query errors, the Parquet fall-through did not synthesize "
-        "the schema-evolved-absent column as NULL."
-    )
+    # id NULL too -> empty-stats bug fired; a query error -> the fall-through
+    # failed to synthesize the schema-evolved-absent column.
+    assert result == _DATA_NULL_RESULT, f"Got: {result!r}"
 
 
 def test_stats_less_manifest_cluster_returns_real_data(started_cluster_iceberg_no_spark):
-    """
-    Cluster/serialization-path coverage for Altinity#1545.
-
-    icebergLocalCluster distributes file-level tasks to workers, serializing
-    DataFileMetaInfo (and its empty columns_info) through the cluster JSON path.
-    An empty columns_info must survive the round-trip and still read real data on
-    the worker, exercising the same fix on the *Cluster variants.  The table is
-    uploaded to every node because local storage is read by each worker from its
-    own filesystem.
-    """
+    # Cluster path: empty columns_info must survive the DataFileMetaInfo JSON
+    # round-trip to the worker, not just the single-node read.
     cluster        = started_cluster_iceberg_no_spark
     instance       = cluster.instances["node1"]
     table_name     = "test_stats_less_cluster_" + get_uuid_str()
     container_base = _container_base(table_name)
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_iceberg_table(
-            tmpdir, table_name, container_base,
-            manifest_entry_schema_str=_MANIFEST_ENTRY_NO_STATS_SCHEMA_STR,
-        )
+        local_dir = _create_iceberg_table(tmpdir, table_name, container_base)
         for node in cluster.instances.values():
             _upload_to_node(node, local_dir, container_base)
 
@@ -462,89 +269,28 @@ def test_stats_less_manifest_cluster_returns_real_data(started_cluster_iceberg_n
         " ORDER BY id"
         " SETTINGS allow_experimental_iceberg_read_optimization=1"
     ).strip()
-    assert result == _REAL_DATA_RESULT, (
-        f"Got: {result!r}\n"
-        "All-NULL rows mean empty columns_info was mishandled on the worker after "
-        "the cluster JSON round-trip."
-    )
+    assert result == _REAL_DATA_RESULT, f"Got: {result!r}"
 
 
-def test_non_empty_stats_absent_column_still_null(started_cluster_iceberg_no_spark):
-    """
-    Non-regression guard: the optimization's legitimate absent-NULL path.
-
-    The Iceberg schema declares ``id``+``data`` but the Parquet file contains
-    only ``id``, and the manifest carries stats for ``id`` only (non-empty
-    columns_info).  The optimization must keep marking the genuinely-absent
-    ``data`` column NULL while ``id`` reads real values.  Passes on both builds;
-    the fix only changes the empty-columns_info case.
-    """
-    instance       = started_cluster_iceberg_no_spark.instances["node1"]
-    table_name     = "test_nonempty_absent_" + get_uuid_str()
-    container_base = _container_base(table_name)
-
-    # value_counts for field_id 1 (id) only -> columns_info = {id}.
-    stats = {"value_counts": [{"key": 1, "value": len(_IDS)}]}
+def test_mergetree_lasagna_roundtrip(started_cluster_iceberg_no_spark):
+    instance = started_cluster_iceberg_no_spark.instances["node1"]
+    table    = "test_lasagna_" + get_uuid_str()
 
-    with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_iceberg_table(
-            tmpdir, table_name, container_base,
-            manifest_entry_schema_str=_MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
-            file_has_data_column=False,
-            data_file_stats=stats,
-        )
-        _upload_to_node(instance, local_dir, container_base)
-
-    result = _query_local(instance, container_base)
-    assert result == _DATA_NULL_RESULT, (
-        f"Got: {result!r}\n"
-        "With non-empty stats the optimization must still read 'id' and inject "
-        "the absent 'data' column as NULL."
+    instance.query(f"DROP TABLE IF EXISTS {table}")
+    instance.query(
+        f"CREATE TABLE {table} (id Int32, ingredient String) "
+        "ENGINE = MergeTree ORDER BY id"
     )
 
+    ingredients = [
+        "pasta", "tomato", "mozzarella", "basil",
+        "ground beef", "onion", "garlic", "parmesan", "ricotta",
+    ]
+    values = ", ".join(f"({i + 1}, '{ing}')" for i, ing in enumerate(ingredients))
+    instance.query(f"INSERT INTO {table} VALUES {values}")
 
-@pytest.mark.parametrize(
-    "manifest_entry_schema_str,stats",
-    [
-        (
-            _MANIFEST_ENTRY_PARTIAL_STATS_SCHEMA_STR,
-            {"value_counts": [{"key": 1, "value": 3}, {"key": 2, "value": 3}]},
-        ),
-        (
-            _MANIFEST_ENTRY_FULL_STATS_SCHEMA_STR,
-            {
-                "column_sizes":      [{"key": 1, "value": 64}, {"key": 2, "value": 128}],
-                "value_counts":      [{"key": 1, "value": 3}, {"key": 2, "value": 3}],
-                "null_value_counts": [{"key": 1, "value": 0}, {"key": 2, "value": 0}],
-            },
-        ),
-    ],
-    ids=["partial_stats", "full_stats"],
-)
-def test_non_empty_stats_returns_real_data(
-    started_cluster_iceberg_no_spark, manifest_entry_schema_str, stats
-):
-    """
-    Non-regression guard: manifests that do populate columns_info (partial
-    value_counts-only, and full Spark-style stats) must keep returning real data
-    when the optimization is enabled.  These are the cases the empty-stats guard
-    must not disturb.
-    """
-    instance       = started_cluster_iceberg_no_spark.instances["node1"]
-    table_name     = "test_nonempty_real_" + get_uuid_str()
-    container_base = _container_base(table_name)
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        local_dir = _create_iceberg_table(
-            tmpdir, table_name, container_base,
-            manifest_entry_schema_str=manifest_entry_schema_str,
-            data_file_stats=stats,
-        )
-        _upload_to_node(instance, local_dir, container_base)
+    result   = instance.query(f"SELECT id, ingredient FROM {table} ORDER BY id").strip()
+    expected = "\n".join(f"{i + 1}\t{ing}" for i, ing in enumerate(ingredients))
+    assert result == expected, f"Got: {result!r}"
 
-    result = _query_local(instance, container_base)
-    assert result == _REAL_DATA_RESULT, (
-        f"Got: {result!r}\n"
-        "A manifest with non-empty stats must return real data when the "
-        "optimization is enabled."
-    )
+    instance.query(f"DROP TABLE IF EXISTS {table}")